/* Function:  p7_ParameterEstimation()
 * Incept:    SRE, Sat Mar 24 10:15:37 2007 [Janelia]
 *
 * Purpose:   <hmm> arrives holding collected, weighted counts. Using the
 *            mixture Dirichlet prior <pri>, replace every count vector
 *            (match/insert/delete transitions, match/insert emissions)
 *            in place with its mean posterior parameter estimate, which
 *            turns the HMM into a parameterized probabilistic model.
 *
 *            Boundary entries that are fixed by convention rather than
 *            estimated (node 0 and node M deletes, TMD at node M,
 *            mat[0]) are set explicitly.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri)
{
  const int M = hmm->M;
  double    counts[p7_MAXABET];    /* current count vector, widened to double        */
  double    probs[p7_MAXABET];     /* estimated probabilities for that vector        */
  double    mixbuf[p7_MAXDCHLET];  /* buffer passed as the <mix> arg of MPParameters */
  int       K;
  int       node;

  /* Match transitions, t[node][0..2], nodes 0..M (node 0 is the B state). */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(&hmm->t[node][0], 3, counts);
      esl_mixdchlet_MPParameters(counts, 3, pri->tm, mixbuf, probs);
      esl_vec_D2F(probs, 3, &hmm->t[node][0]);
    }
  /* TMD at node M is 0 by convention; renormalize what remains. */
  hmm->t[M][p7H_MD] = 0.0;
  esl_vec_FNorm(hmm->t[M], 3);

  /* Insert transitions, t[node][3..4], nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(&hmm->t[node][3], 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->ti, mixbuf, probs);
      esl_vec_D2F(probs, 2, &hmm->t[node][3]);
    }

  /* Delete transitions, t[node][5..6], nodes 1..M-1 only.
   * Node 0 is unused; node M has no next D state and must go to E.
   * Both get TDM = 1.0 and TDD = 0.0 by convention.
   */
  for (node = 1; node < M; node++)
    {
      esl_vec_F2D(&hmm->t[node][5], 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->td, mixbuf, probs);
      esl_vec_D2F(probs, 2, &hmm->t[node][5]);
    }
  hmm->t[M][p7H_DM] = 1.0;
  hmm->t[0][p7H_DM] = 1.0;
  hmm->t[M][p7H_DD] = 0.0;
  hmm->t[0][p7H_DD] = 0.0;

  K = hmm->abc->K;

  /* Match emissions, nodes 1..M. */
  for (node = 1; node <= M; node++)
    {
      esl_vec_F2D(hmm->mat[node], K, counts);
      esl_mixdchlet_MPParameters(counts, K, pri->em, mixbuf, probs);
      esl_vec_D2F(probs, K, hmm->mat[node]);
    }
  /* mat[0] is set to a valid probability vector: first element 1, rest 0. */
  esl_vec_FSet(hmm->mat[0], K, 0.);
  hmm->mat[0][0] = 1.0;

  /* Insert emissions, nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->ins[node], K, counts);
      esl_mixdchlet_MPParameters(counts, K, pri->ei, mixbuf, probs);
      esl_vec_D2F(probs, K, hmm->ins[node]);
    }

  return eslOK;
}
/* Function:  p7_builder_SetScoreSystem()
 * Synopsis:  Initialize score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using a substitution matrix
 *            from a file.
 *
 *            Read a standard substitution score matrix from file
 *            <mxfile>. If <mxfile> is <NULL>, default to a built-in
 *            matrix: BLOSUM62 when the builder's alphabet is
 *            <eslAMINO>, DNA1 otherwise. <mxfile> and <env> are handed
 *            to <esl_fileparser_Open()>, which is responsible for
 *            special cases such as "-" (read from <stdin>) and for
 *            searching the colon-delimited directory list in
 *            environment variable <env> when <env> is non-<NULL>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 *            Any score system previously held by <bld> is destroyed
 *            first. <bld->popen> and <bld->pextend> are only updated
 *            on success.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            mxfile   - score matrix file to use, or NULL for the built-in default
 *            env      - env variable containing directory list where <mxfile> may reside
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *
 *            The status from <esl_fileparser_Open()> or
 *            <esl_scorematrix_Read()> (for example <eslENOTFOUND>) if
 *            <mxfile> can't be opened or parsed.
 *
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 *
 *            On these errors, <bld->errbuf> contains a useful error
 *            message for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_SetScoreSystem(P7_BUILDER *bld, const char *mxfile, const char *env, double popen, double pextend, P7_BG *bg)
{
  ESL_FILEPARSER  *efp     = NULL;
  double          *f       = NULL;
  const char      *mxname  = mxfile;   /* printable matrix name; made non-NULL below so it is safe for "%s" */
  double           slambda;
  int              status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. Clear the pointers too, so
   * that a failure further down cannot leave <bld> referring to freed memory.
   */
  if (bld->S != NULL) esl_scorematrix_Destroy(bld->S);
  if (bld->Q != NULL) esl_dmatrix_Destroy(bld->Q);
  bld->S = NULL;
  bld->Q = NULL;

  /* Get the scoring matrix */
  if (mxfile == NULL)
    {
      mxname = (bld->abc->type == eslAMINO) ? "BLOSUM62" : "DNA1";

      if ((bld->S = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
      if ((status = esl_scorematrix_Set(mxname, bld->S)) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "failed to set score matrix %s as a built-in", mxname);
    }
  else
    {
      /* esl_scorematrix_Read() hands back its own newly allocated matrix
       * through its last argument, so nothing is preallocated on this path
       * (preallocating would leak the matrix that Read() overwrites).
       */
      if ((status = esl_fileparser_Open(mxfile, env, &efp)) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "Failed to find or open matrix file %s", mxfile);
      if ((status = esl_scorematrix_Read(efp, bld->abc, &(bld->S))) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "Failed to read matrix from %s:\n%s", mxfile, efp->errbuf);
      esl_fileparser_Close(efp);
      efp = NULL;
    }

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.
   * Every failure here is reported to the caller as <eslEINVAL>.
   */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "input score matrix %s has no valid solution for lambda", mxname);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve input score matrix %s for lambda: are you sure it's valid?", mxname);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving input score matrix %s for probability parameters", mxname);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;
  free(f);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Close(efp);
  free(f);
  return status;
}
/* Function:  p7_builder_LoadScoreSystem()
 * Synopsis:  Load a standard score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using the standard (built-in) score
 *            matrix named <matrix>.
 *
 *            Available score matrices <matrix> include PAM30, 70, 120, and 240;
 *            and BLOSUM45, 50, 62, 80, and 90. See <esl_scorematrix.c>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 *            Any score system previously held by <bld> is destroyed
 *            first. <bld->popen> and <bld->pextend> are only updated
 *            on success.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            matrix   - name of the built-in score matrix to use
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *
 *            <eslENOTFOUND> if no built-in matrix is named <matrix>;
 *            any other failure status of <esl_scorematrix_Set()> is
 *            passed through unchanged.
 *
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 *
 *            On these errors, <bld->errbuf> contains a useful error
 *            message for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_LoadScoreSystem(P7_BUILDER *bld, const char *matrix, double popen, double pextend, P7_BG *bg)
{
  double  *f = NULL;
  double   slambda;
  int      status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. Clear the pointers too, so
   * that a failure further down cannot leave <bld> referring to freed memory.
   */
  if (bld->S != NULL) esl_scorematrix_Destroy(bld->S);
  if (bld->Q != NULL) esl_dmatrix_Destroy(bld->Q);
  bld->S = NULL;
  bld->Q = NULL;

  /* Get the scoring matrix */
  if ((bld->S = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
  status = esl_scorematrix_Set(matrix, bld->S);
  if      (status == eslENOTFOUND) ESL_XFAIL(status, bld->errbuf, "no matrix named %s is available as a built-in", matrix);
  else if (status != eslOK)        ESL_XFAIL(status, bld->errbuf, "failed to set score matrix %s as a built-in", matrix);

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.
   * Failures shouldn't happen here: these are standard matrices.
   * Every failure is reported to the caller as <eslEINVAL>.
   */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "built-in score matrix %s has no valid solution for lambda", matrix);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve score matrix %s for lambda", matrix);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving score matrix %s for probability parameters", matrix);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;
  free(f);
  return eslOK;

 ERROR:
  free(f);
  return status;
}
static void utest_pvectors(void) { char *msg = "pvector unit test failed"; double p1[4] = { 0.25, 0.25, 0.25, 0.25 }; double p2[4]; double p3[4]; float p1f[4]; float p2f[4] = { 0.0, 0.5, 0.5, 0.0 }; float p3f[4]; int n = 4; double result; esl_vec_D2F(p1, n, p1f); esl_vec_F2D(p2f, n, p2); if (esl_vec_DValidate(p1, n, 1e-12, NULL) != eslOK) esl_fatal(msg); if (esl_vec_FValidate(p1f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p1, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p1f, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p2, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p2f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p2, p1, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FRelEntropy(p2f, p1f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p1, p2, n); if (result != eslINFINITY) esl_fatal(msg); result = esl_vec_FRelEntropy(p1f, p2f, n); if (result != eslINFINITY) esl_fatal(msg); esl_vec_DLog(p2, n); if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg); esl_vec_DExp(p2, n); if (p2[0] != 0.) esl_fatal(msg); esl_vec_FLog(p2f, n); if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); esl_vec_FExp(p2f, n); if (p2f[0] != 0.) 
esl_fatal(msg); esl_vec_DCopy(p2, n, p3); esl_vec_DScale(p3, n, 10.); esl_vec_DNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DLog(p3, n); result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DIncrement(p3, n, 2.0); esl_vec_DLogNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_FCopy(p2f, n, p3f); esl_vec_FScale(p3f, n, 10.); esl_vec_FNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FLog(p3f, n); result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FIncrement(p3f, n, 2.0); esl_vec_FLogNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); return; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *hmmfile = esl_opt_GetArg(go, 1); char *qfile = esl_opt_GetArg(go, 2); ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQFILE *qfp = NULL; FILE *hmmfp = NULL; ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; double *fa = NULL; double popen = esl_opt_GetReal (go, "-q"); double pextend = esl_opt_GetReal (go, "-r"); char *mxfile = esl_opt_GetString(go, "-m"); char errbuf[eslERRBUFSIZE]; double slambda; int a,b; int status; /* Reverse engineer a scoring matrix to obtain conditional prob's * that we'll use for the single-seq query HMM. Because score mx is * symmetric, we can set up P[a][b] = P(b | a), so we can use the * matrix rows as HMM match emission vectors. This means dividing * the joint probs through by f_a. */ if (mxfile == NULL) { if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores"); } else { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */ ESL_ALLOC(fa, sizeof(double) * bg->abc->K); esl_vec_F2D(bg->f, bg->abc->K, fa); /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */ status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); if (status == eslEINVAL) esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix); else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix); else if (status != eslOK) esl_fatal("unexpected error in solving score matrix %s for probability parameters", 
matrix); esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Open the query sequence file in FASTA format */ status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp); if (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile); else if (status == eslEFORMAT) esl_fatal("Format of %s unrecognized.", qfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open of %s failed, code %d.", qfile, status); /* Open the output HMM file */ if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile); /* For each sequence, build a model and save it. */ while ((status = esl_sqio_Read(qfp, qsq)) == eslOK) { p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); if ( p7_hmm_Validate(hmm, errbuf, 1e-5) != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf); if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm) != eslOK) esl_fatal("HMM save failed"); p7_hmm_Destroy(hmm); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", qfp->filename, qfp->linenumber, qfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, qfp->filename); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); free(fb); esl_sq_Destroy(qsq); esl_sqfile_Close(qfp); fclose(hmmfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }