/* Function:  p7_builder_SetScoreSystem()
 * Synopsis:  Initialize score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using a substitution matrix
 *            from a file.
 *
 *            Read a standard substitution score matrix from file
 *            <mxfile>. If <mxfile> is <NULL>, default to the built-in
 *            BLOSUM62 scores when the builder's alphabet is <eslAMINO>,
 *            and to the built-in DNA1 scores otherwise. If <mxfile> is
 *            "-", read score matrix from <stdin> stream. If <env> is
 *            non-<NULL> and <mxfile> is not found in the current
 *            working directory, look for <mxfile> in colon-delimited
 *            directory list contained in environment variable <env>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 *            Any score system previously held by <bld> is destroyed
 *            first. If this call then fails, <bld->S> and <bld->Q>
 *            never refer to the destroyed objects.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            mxfile   - score matrix file to use, or NULL for the built-in default
 *            env      - env variable containing directory list where <mxfile> may reside
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *
 *            <eslENOTFOUND> if <mxfile> can't be found or opened, even
 *            in any of the directories specified by the <env> variable.
 *
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 *
 *            On either error, <bld->errbuf> contains a useful error message
 *            for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_SetScoreSystem(P7_BUILDER *bld, const char *mxfile, const char *env, double popen, double pextend, P7_BG *bg)
{
  ESL_FILEPARSER *efp     = NULL;
  double         *f       = NULL;
  const char     *mxname  = NULL;    /* printable matrix name; never NULL once chosen below */
  double          slambda;
  int             status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. The pointers are reset
   * so that an early error exit cannot leave the builder holding
   * pointers to freed objects.
   */
  if (bld->S != NULL) { esl_scorematrix_Destroy(bld->S); bld->S = NULL; }
  if (bld->Q != NULL) { esl_dmatrix_Destroy(bld->Q);     bld->Q = NULL; }

  /* Get the scoring matrix */
  if (mxfile == NULL)
    {
      mxname = (bld->abc->type == eslAMINO) ? "BLOSUM62" : "DNA1";

      if ((bld->S = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }
      if ((status = esl_scorematrix_Set(mxname, bld->S)) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "failed to set built-in score matrix %s", mxname);
    }
  else
    {
      mxname = mxfile;

      /* esl_scorematrix_Read() receives &(bld->S) as an output pointer and
       * stores its own matrix there, so no matrix is pre-created on this
       * path; doing so would leak it when the pointer is overwritten.
       */
      if ((status = esl_fileparser_Open(mxfile, env, &efp)) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "Failed to find or open matrix file %s", mxfile);
      if ((status = esl_scorematrix_Read(efp, bld->abc, &(bld->S))) != eslOK)
        ESL_XFAIL(status, bld->errbuf, "Failed to read matrix from %s:\n%s", mxfile, efp->errbuf);
      esl_fileparser_Close(efp);
      efp = NULL;
    }

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.
   * Every failure is reported to the caller as eslEINVAL. <mxname> is used in
   * the messages because <mxfile> is NULL on the built-in path.
   */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "input score matrix %s has no valid solution for lambda", mxname);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve input score matrix %s for lambda: are you sure it's valid?", mxname);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving input score matrix %s for probability parameters", mxname);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;

  free(f);
  return eslOK;

 ERROR:
  if (efp != NULL) esl_fileparser_Close(efp);
  free(f);                      /* free(NULL) is a no-op */
  return status;
}
/* Function:  p7_builder_LoadScoreSystem()
 * Synopsis:  Load a standard score system for single sequence queries.
 *
 * Purpose:   Initialize the builder <bld> to be able to parameterize
 *            single sequence queries, using the standard (built-in) score
 *            matrix named <matrix>.
 *
 *            Available score matrices <matrix> include PAM30, 70, 120, and 240;
 *            and BLOSUM45, 50, 62, 80, and 90. See <esl_scorematrix.c>.
 *
 *            Set the gap-open and gap-extend probabilities to
 *            <popen>, <pextend>, respectively.
 *
 *            Use background residue frequencies in the null model
 *            <bg> to convert substitution matrix scores to
 *            conditional probability parameters.
 *
 *            Any score system previously held by <bld> is destroyed
 *            first. If this call then fails, <bld->S> and <bld->Q>
 *            never refer to the destroyed objects.
 *
 * Args:      bld      - <P7_BUILDER> to initialize
 *            matrix   - name of the built-in score matrix to use
 *            popen    - gap open probability
 *            pextend  - gap extend probability
 *            bg       - null model, containing background frequencies
 *
 * Returns:   <eslOK> on success.
 *
 *            <eslENOTFOUND> if no built-in matrix named <matrix> is
 *            available. Any other failure status from
 *            <esl_scorematrix_Set()> is returned unchanged.
 *
 *            <eslEINVAL> if the score matrix can't be converted into
 *            conditional probabilities; for example, if it has no valid
 *            solution for <lambda>.
 *
 *            On these errors, <bld->errbuf> contains a useful error message
 *            for the user.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_builder_LoadScoreSystem(P7_BUILDER *bld, const char *matrix, double popen, double pextend, P7_BG *bg)
{
  double *f = NULL;
  double  slambda;
  int     status;

  bld->errbuf[0] = '\0';

  /* If a score system is already set, delete it. The pointers are reset
   * so that an early error exit cannot leave the builder holding
   * pointers to freed objects.
   */
  if (bld->S != NULL) { esl_scorematrix_Destroy(bld->S); bld->S = NULL; }
  if (bld->Q != NULL) { esl_dmatrix_Destroy(bld->Q);     bld->Q = NULL; }

  /* Get the scoring matrix */
  if ((bld->S = esl_scorematrix_Create(bld->abc)) == NULL) { status = eslEMEM; goto ERROR; }

  status = esl_scorematrix_Set(matrix, bld->S);
  if      (status == eslENOTFOUND) ESL_XFAIL(status, bld->errbuf, "no matrix named %s is available as a built-in", matrix);
  else if (status != eslOK)        ESL_XFAIL(status, bld->errbuf, "failed to set score matrix %s as a built-in", matrix);

  /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */
  ESL_ALLOC(f, sizeof(double) * bg->abc->K);
  esl_vec_F2D(bg->f, bg->abc->K, f);

  /* Backcalculate joint probability matrix Q, given scores S and background freqs bg->f.
   * Failures shouldn't happen here: these are standard matrices.
   * Every failure is reported to the caller as eslEINVAL.
   */
  status = esl_scorematrix_ProbifyGivenBG(bld->S, f, f, &slambda, &(bld->Q));
  if      (status == eslEINVAL)  ESL_XFAIL(eslEINVAL, bld->errbuf, "built-in score matrix %s has no valid solution for lambda", matrix);
  else if (status == eslENOHALT) ESL_XFAIL(eslEINVAL, bld->errbuf, "failed to solve score matrix %s for lambda", matrix);
  else if (status != eslOK)      ESL_XFAIL(eslEINVAL, bld->errbuf, "unexpected error in solving score matrix %s for probability parameters", matrix);

  /* Convert joint probabilities P(ab) to conditionals P(b|a) */
  esl_scorematrix_JointToConditionalOnQuery(bld->abc, bld->Q);

  bld->popen   = popen;
  bld->pextend = pextend;

  free(f);
  return eslOK;

 ERROR:
  free(f);                      /* free(NULL) is a no-op */
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *qfile = esl_opt_GetArg(go, 1); char *tfile = esl_opt_GetArg(go, 2); ESL_SQFILE *qfp = NULL; ESL_SQFILE *tfp = NULL; ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQ *tsq = esl_sq_CreateDigital(abc); ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; P7_REFMX *vit = p7_refmx_Create(200, 400); /* will grow as needed */ double *fa = malloc(sizeof(double) * abc->K); double popen = 0.02; double pextend = 0.4; double lambda; float vsc; float nullsc; int status; esl_composition_BL62(fa); esl_vec_D2F(fa, abc->K, bg->f); esl_scorematrix_Set("BLOSUM62", S); esl_scorematrix_ProbifyGivenBG(S, fa, fa, &lambda, &Q); esl_scorematrix_JointToConditionalOnQuery(abc, Q); if (esl_sqfile_OpenDigital(abc, qfile, eslSQFILE_UNKNOWN, NULL, &qfp) != eslOK) esl_fatal("failed to open %s", qfile); if (esl_sqio_Read(qfp, qsq) != eslOK) esl_fatal("failed to read query seq"); p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); p7_hmm_SetComposition(hmm); p7_hmm_SetConsensus(hmm, qsq); gm = p7_profile_Create(hmm->M, abc); p7_profile_ConfigUnilocal(gm, hmm, bg, 400); if (esl_sqfile_OpenDigital(abc, tfile, eslSQFILE_UNKNOWN, NULL, &tfp) != eslOK) esl_fatal("failed to open %s", tfile); while ((status = esl_sqio_Read(tfp, tsq)) == eslOK) { p7_bg_SetLength (bg, tsq->n); p7_profile_SetLength(gm, tsq->n); p7_ReferenceViterbi(tsq->dsq, tsq->n, gm, vit, NULL, &vsc); p7_bg_NullOne(bg, tsq->dsq, tsq->n, &nullsc); printf("%.4f %-25s %-25s\n", (vsc - nullsc) / eslCONST_LOG2, tsq->name, gm->name); esl_sq_Reuse(tsq); p7_refmx_Reuse(vit); } p7_refmx_Destroy(vit); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); p7_bg_Destroy(bg); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); esl_sq_Destroy(qsq); esl_sq_Destroy(tsq); 
esl_sqfile_Close(qfp); esl_sqfile_Close(tfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *hmmfile = esl_opt_GetArg(go, 1); char *qfile = esl_opt_GetArg(go, 2); ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQFILE *qfp = NULL; FILE *hmmfp = NULL; ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; double *fa = NULL; double popen = esl_opt_GetReal (go, "-q"); double pextend = esl_opt_GetReal (go, "-r"); char *mxfile = esl_opt_GetString(go, "-m"); char errbuf[eslERRBUFSIZE]; double slambda; int a,b; int status; /* Reverse engineer a scoring matrix to obtain conditional prob's * that we'll use for the single-seq query HMM. Because score mx is * symmetric, we can set up P[a][b] = P(b | a), so we can use the * matrix rows as HMM match emission vectors. This means dividing * the joint probs through by f_a. */ if (mxfile == NULL) { if (esl_scorematrix_Set("BLOSUM62", S) != eslOK) esl_fatal("failed to set BLOSUM62 scores"); } else { ESL_FILEPARSER *efp = NULL; if ( esl_fileparser_Open(mxfile, NULL, &efp) != eslOK) esl_fatal("failed to open score file %s", mxfile); if ( esl_scorematrix_Read(efp, abc, &S) != eslOK) esl_fatal("failed to read matrix from %s", mxfile); esl_fileparser_Close(efp); } /* A wasteful conversion of the HMMER single-precision background probs to Easel double-prec */ ESL_ALLOC(fa, sizeof(double) * bg->abc->K); esl_vec_F2D(bg->f, bg->abc->K, fa); /* Backcalculate joint probabilities Q, given score matrix S and background frequencies fa */ status = esl_scorematrix_ProbifyGivenBG(S, fa, fa, &slambda, &Q); if (status == eslEINVAL) esl_fatal("built-in score matrix %s has no valid solution for lambda", matrix); else if (status == eslENOHALT) esl_fatal("failed to solve score matrix %s for lambda", matrix); else if (status != eslOK) esl_fatal("unexpected error in solving score matrix %s for probability parameters", 
matrix); esl_scorematrix_JointToConditionalOnQuery(abc, Q); /* Open the query sequence file in FASTA format */ status = esl_sqfile_Open(qfile, eslSQFILE_FASTA, NULL, &qfp); if (status == eslENOTFOUND) esl_fatal("No such file %s.", qfile); else if (status == eslEFORMAT) esl_fatal("Format of %s unrecognized.", qfile); else if (status == eslEINVAL) esl_fatal("Can't autodetect stdin or .gz."); else if (status != eslOK) esl_fatal("Open of %s failed, code %d.", qfile, status); /* Open the output HMM file */ if ((hmmfp = fopen(hmmfile, "w")) == NULL) esl_fatal("Failed to open output HMM file %s", hmmfile); /* For each sequence, build a model and save it. */ while ((status = esl_sqio_Read(qfp, qsq)) == eslOK) { p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); if ( p7_hmm_Validate(hmm, errbuf, 1e-5) != eslOK) esl_fatal("HMM validation failed: %s\n", errbuf); if ( p7_hmmfile_WriteASCII(hmmfp, -1, hmm) != eslOK) esl_fatal("HMM save failed"); p7_hmm_Destroy(hmm); } if (status == eslEFORMAT) esl_fatal("Parse failed (sequence file %s line %" PRId64 "):\n%s\n", qfp->filename, qfp->linenumber, qfp->errbuf); else if (status != eslEOF) esl_fatal("Unexpected error %d reading sequence file %s", status, qfp->filename); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); free(fb); esl_sq_Destroy(qsq); esl_sqfile_Close(qfp); fclose(hmmfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }