/* Function:  p7_ParameterEstimation()
 * Incept:    SRE, Sat Mar 24 10:15:37 2007 [Janelia]
 *
 * Purpose:   <hmm> arrives holding collected, weighted counts. Each
 *            count vector is combined with the corresponding mixture
 *            Dirichlet prior in <pri> and overwritten, in place, by
 *            its mean posterior parameter estimate, turning the
 *            count-collection HMM into a probabilistic model.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri)
{
  const int M = hmm->M;
  const int K = hmm->abc->K;
  double    counts[p7_MAXABET];   /* one count vector, widened to double */
  double    post[p7_MAXABET];     /* mean posterior estimate for it      */
  double    mixq[p7_MAXDCHLET];   /* scratch for the <mix> argument      */
  int       node;

  /* Match transitions: the first 3 entries of t[], nodes 0..M
   * (node 0 is the B state).
   */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->t[node], 3, counts);
      esl_mixdchlet_MPParameters(counts, 3, pri->tm, mixq, post);
      esl_vec_D2F(post, 3, hmm->t[node]);
    }
  /* At node M the MD transition is forced to zero, then the three
   * match transitions are renormalized.
   */
  hmm->t[M][p7H_MD] = 0.0;
  esl_vec_FNorm(hmm->t[M], 3);

  /* Insert transitions: 2 entries starting at offset 3, nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(&hmm->t[node][3], 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->ti, mixq, post);
      esl_vec_D2F(post, 2, &hmm->t[node][3]);
    }

  /* Delete transitions: 2 entries starting at offset 5, nodes 1..M-1. */
  for (node = 1; node < M; node++)
    {
      esl_vec_F2D(&hmm->t[node][5], 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->td, mixq, post);
      esl_vec_D2F(post, 2, &hmm->t[node][5]);
    }
  /* Nodes 0 and M are fixed by convention rather than estimated:
   * DM = 1.0 and DD = 0.0 (node 0 is unused; node M has no next D).
   */
  hmm->t[M][p7H_DM] = 1.0;
  hmm->t[0][p7H_DM] = 1.0;
  hmm->t[M][p7H_DD] = 0.0;
  hmm->t[0][p7H_DD] = 0.0;

  /* Match emissions, nodes 1..M. */
  for (node = 1; node <= M; node++)
    {
      esl_vec_F2D(hmm->mat[node], K, counts);
      esl_mixdchlet_MPParameters(counts, K, pri->em, mixq, post);
      esl_vec_D2F(post, K, hmm->mat[node]);
    }
  /* mat[0] is set by convention to a valid probability vector:
   * first element 1, all others 0.
   */
  esl_vec_FSet(hmm->mat[0], K, 0.);
  hmm->mat[0][0] = 1.0;

  /* Insert emissions, nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->ins[node], K, counts);
      esl_mixdchlet_MPParameters(counts, K, pri->ei, mixq, post);
      esl_vec_D2F(post, K, hmm->ins[node]);
    }

  return eslOK;
}
/* Function: p7_Seqmodel() * Synopsis: Make a profile HMM from a single sequence. * * Purpose: Make a profile HMM from a single sequence, for * probabilistic Smith/Waterman alignment, HMMER3-style. * * The query is digital sequence <dsq> of length <M> * residues in alphabet <abc>, named <name>. * * The scoring system is given by <Q>, <f>, <popen>, and * <pextend>. <Q> is a $K \times K$ matrix giving * conditional residue probabilities $P(a \mid b)}$; these * are typically obtained by reverse engineering a score * matrix like BLOSUM62. <f> is a vector of $K$ background * frequencies $p_a$. <popen> and <pextend> are the * probabilities assigned to gap-open ($t_{MI}$ and * $t_{MD}$) and gap-extend ($t_{II}$ and $t_{DD}$) * transitions. * * The <p7H_SINGLE> flag is set on the <hmm>. Model * configuration (<p7_profile_Config(), friends> detects * this flag. <B->Mk> entry transitions include a match * state occupancy term for profile HMMs, but for single * queries, that <occ[]> term is assumed 1.0 for all * positions. See commentary in modelconfig.c. * * Args: * * Returns: <eslOK> on success, and a newly allocated HMM is returned * in <ret_hmm>. * * Throws: <eslEMEM> on allocation error, and <*ret_hmm> is <NULL>. */ int p7_Seqmodel(const ESL_ALPHABET *abc, ESL_DSQ *dsq, int M, char *name, ESL_DMATRIX *Q, float *f, double popen, double pextend, P7_HMM **ret_hmm) { int status; P7_HMM *hmm = NULL; char *logmsg = "[HMM created from a query sequence]"; int k; if ((hmm = p7_hmm_Create(M, abc)) == NULL) { status = eslEMEM; goto ERROR; } for (k = 0; k <= M; k++) { /* Use rows of P matrix as source of match emission vectors */ if (k > 0) esl_vec_D2F(Q->mx[(int) dsq[k]], abc->K, hmm->mat[k]); /* Set inserts to background for now. This will be improved. 
*/ esl_vec_FCopy(f, abc->K, hmm->ins[k]); hmm->t[k][p7H_MM] = 1.0 - 2 * popen; hmm->t[k][p7H_MI] = popen; hmm->t[k][p7H_MD] = popen; hmm->t[k][p7H_IM] = 1.0 - pextend; hmm->t[k][p7H_II] = pextend; hmm->t[k][p7H_DM] = 1.0 - pextend; hmm->t[k][p7H_DD] = pextend; } /* Deal w/ special stuff at node M, overwriting a little of what we * just did. */ hmm->t[M][p7H_MM] = 1.0 - popen; hmm->t[M][p7H_MD] = 0.; hmm->t[M][p7H_DM] = 1.0; hmm->t[M][p7H_DD] = 0.; /* Add mandatory annotation */ p7_hmm_SetName(hmm, name); p7_hmm_AppendComlog(hmm, 1, &logmsg); hmm->nseq = 1; p7_hmm_SetCtime(hmm); hmm->checksum = 0; hmm->flags |= p7H_SINGLE; *ret_hmm = hmm; return eslOK; ERROR: if (hmm != NULL) p7_hmm_Destroy(hmm); *ret_hmm = NULL; return status; }
/* utest_choose()
 * Unit test for esl_rnd_DChoose() and esl_rnd_FChoose().
 *
 * A probability vector over <nbins> bins is sampled, <n> choices are
 * drawn from it with each function in turn, and the observed bin counts
 * are put through a chi-squared test against the expected counts.
 * Any failure is fatal. With <be_verbose> set, the chi-squared
 * probability for each function is printed.
 */
static void
utest_choose(ESL_RANDOMNESS *r, int n, int nbins, int be_verbose)
{
  double *dprob = NULL;   /* sampled probability vector, double precision */
  float  *fprob = NULL;   /* the same vector converted to float           */
  int    *tally = NULL;   /* observed count per bin                       */
  double  chisq;
  double  pvalue;
  double  expect;
  double  delta;
  int     bin;
  int     draw;

  dprob = malloc(sizeof(double) * nbins);
  if (dprob == NULL) esl_fatal("malloc failed");
  fprob = malloc(sizeof(float) * nbins);
  if (fprob == NULL) esl_fatal("malloc failed");
  tally = malloc(sizeof(int) * nbins);
  if (tally == NULL) esl_fatal("malloc failed");

  /* Sample a random multinomial probability vector. */
  if (esl_dirichlet_DSampleUniform(r, nbins, dprob) != eslOK)
    esl_fatal("dirichlet sample failed");
  esl_vec_D2F(dprob, nbins, fprob);

  /* Observed counts from DChoose(). */
  esl_vec_ISet(tally, nbins, 0);
  for (draw = 0; draw < n; draw++)
    {
      bin = esl_rnd_DChoose(r, dprob, nbins);
      tally[bin]++;
    }

  /* X^2 statistic for those counts, then the test itself. */
  chisq = 0.;
  for (bin = 0; bin < nbins; bin++)
    {
      expect = (double) n * dprob[bin];
      delta  = (double) tally[bin] - expect;
      chisq += delta * delta / expect;
    }
  if (esl_stats_ChiSquaredTest(nbins, chisq, &pvalue) != eslOK)
    esl_fatal("chi square eval failed");
  if (be_verbose) printf("DChoose():  \t%g\n", pvalue);
  if (pvalue < 0.01) esl_fatal("chi squared test failed");

  /* Same again for FChoose(). Expected counts still come from the
   * double-precision vector.
   */
  esl_vec_ISet(tally, nbins, 0);
  for (draw = 0; draw < n; draw++)
    {
      bin = esl_rnd_FChoose(r, fprob, nbins);
      tally[bin]++;
    }

  chisq = 0.;
  for (bin = 0; bin < nbins; bin++)
    {
      expect = (double) n * dprob[bin];
      delta  = (double) tally[bin] - expect;
      chisq += delta * delta / expect;
    }
  if (esl_stats_ChiSquaredTest(nbins, chisq, &pvalue) != eslOK)
    esl_fatal("chi square eval failed");
  if (be_verbose) printf("FChoose():  \t%g\n", pvalue);
  if (pvalue < 0.01) esl_fatal("chi squared test failed");

  free(dprob);
  free(fprob);
  free(tally);
  return;
}
static void utest_pvectors(void) { char *msg = "pvector unit test failed"; double p1[4] = { 0.25, 0.25, 0.25, 0.25 }; double p2[4]; double p3[4]; float p1f[4]; float p2f[4] = { 0.0, 0.5, 0.5, 0.0 }; float p3f[4]; int n = 4; double result; esl_vec_D2F(p1, n, p1f); esl_vec_F2D(p2f, n, p2); if (esl_vec_DValidate(p1, n, 1e-12, NULL) != eslOK) esl_fatal(msg); if (esl_vec_FValidate(p1f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p1, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p1f, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p2, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p2f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p2, p1, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FRelEntropy(p2f, p1f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p1, p2, n); if (result != eslINFINITY) esl_fatal(msg); result = esl_vec_FRelEntropy(p1f, p2f, n); if (result != eslINFINITY) esl_fatal(msg); esl_vec_DLog(p2, n); if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg); esl_vec_DExp(p2, n); if (p2[0] != 0.) esl_fatal(msg); esl_vec_FLog(p2f, n); if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); esl_vec_FExp(p2f, n); if (p2f[0] != 0.) 
esl_fatal(msg); esl_vec_DCopy(p2, n, p3); esl_vec_DScale(p3, n, 10.); esl_vec_DNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DLog(p3, n); result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DIncrement(p3, n, 2.0); esl_vec_DLogNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_FCopy(p2f, n, p3f); esl_vec_FScale(p3f, n, 10.); esl_vec_FNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FLog(p3f, n); result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FIncrement(p3f, n, 2.0); esl_vec_FLogNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); return; }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 2, argc, argv, banner, usage); ESL_ALPHABET *abc = esl_alphabet_Create(eslAMINO); char *qfile = esl_opt_GetArg(go, 1); char *tfile = esl_opt_GetArg(go, 2); ESL_SQFILE *qfp = NULL; ESL_SQFILE *tfp = NULL; ESL_SQ *qsq = esl_sq_CreateDigital(abc); ESL_SQ *tsq = esl_sq_CreateDigital(abc); ESL_SCOREMATRIX *S = esl_scorematrix_Create(abc); ESL_DMATRIX *Q = NULL; P7_BG *bg = p7_bg_Create(abc); P7_HMM *hmm = NULL; P7_PROFILE *gm = NULL; P7_REFMX *vit = p7_refmx_Create(200, 400); /* will grow as needed */ double *fa = malloc(sizeof(double) * abc->K); double popen = 0.02; double pextend = 0.4; double lambda; float vsc; float nullsc; int status; esl_composition_BL62(fa); esl_vec_D2F(fa, abc->K, bg->f); esl_scorematrix_Set("BLOSUM62", S); esl_scorematrix_ProbifyGivenBG(S, fa, fa, &lambda, &Q); esl_scorematrix_JointToConditionalOnQuery(abc, Q); if (esl_sqfile_OpenDigital(abc, qfile, eslSQFILE_UNKNOWN, NULL, &qfp) != eslOK) esl_fatal("failed to open %s", qfile); if (esl_sqio_Read(qfp, qsq) != eslOK) esl_fatal("failed to read query seq"); p7_Seqmodel(abc, qsq->dsq, qsq->n, qsq->name, Q, bg->f, popen, pextend, &hmm); p7_hmm_SetComposition(hmm); p7_hmm_SetConsensus(hmm, qsq); gm = p7_profile_Create(hmm->M, abc); p7_profile_ConfigUnilocal(gm, hmm, bg, 400); if (esl_sqfile_OpenDigital(abc, tfile, eslSQFILE_UNKNOWN, NULL, &tfp) != eslOK) esl_fatal("failed to open %s", tfile); while ((status = esl_sqio_Read(tfp, tsq)) == eslOK) { p7_bg_SetLength (bg, tsq->n); p7_profile_SetLength(gm, tsq->n); p7_ReferenceViterbi(tsq->dsq, tsq->n, gm, vit, NULL, &vsc); p7_bg_NullOne(bg, tsq->dsq, tsq->n, &nullsc); printf("%.4f %-25s %-25s\n", (vsc - nullsc) / eslCONST_LOG2, tsq->name, gm->name); esl_sq_Reuse(tsq); p7_refmx_Reuse(vit); } p7_refmx_Destroy(vit); p7_profile_Destroy(gm); p7_hmm_Destroy(hmm); p7_bg_Destroy(bg); esl_dmatrix_Destroy(Q); esl_scorematrix_Destroy(S); free(fa); esl_sq_Destroy(qsq); esl_sq_Destroy(tsq); 
esl_sqfile_Close(qfp); esl_sqfile_Close(tfp); esl_alphabet_Destroy(abc); esl_getopts_Destroy(go); return 0; }