/* Function:  p7_GNull2_ByTrace()
 * Synopsis:  Assign null2 scores to an envelope by the sampling method.
 * Incept:    SRE, Thu May  1 10:00:43 2008 [Janelia]
 *
 * Purpose:   Given a traceback <tr> for an alignment of model <gm> to
 *            some target sequence, calculate null2 odds ratios
 *            $\frac{f'(x)}{f(x)}$ as the state-usage-weighted emission
 *            probabilities, with state usages calculated by counting
 *            emissions used at positions <zstart..zend> in the trace.
 *
 *            Because we only need to collect state usages from the
 *            trace <tr>, the target sequence is irrelevant. Because
 *            we are only averaging emission odds ratios from model
 *            <gm>, the configuration of <gm> is irrelevant (uni
 *            vs. multihit, or length config).
 *
 *            M and I elements always count as one emission. An N, C,
 *            or J element counts as an emission only when the element
 *            immediately before it in the trace is in the same state.
 *            An N/C/J at trace index 0 has no predecessor and is
 *            therefore never counted.
 *
 * Args:      gm     - model, in any configuration; only emission odds are used
 *            tr     - traceback for any region (or all) of a target sequence
 *            zstart - first elem in <tr> to collect from; use 0 for complete
 *            zend   - last elem in <tr> to collect from; use tr->N-1 for complete
 *            wrk    - DP matrix w/ at least one row, for workspace; row 0 of
 *                     <wrk->dp> and the first p7G_NXCELLS cells of <wrk->xmx>
 *                     are overwritten
 *            null2  - RESULT: odds ratios f'(x)/f(x) for all Kp residues;
 *                     indices up to <gm->abc->Kp-1> are written, so the
 *                     caller must provide at least Kp floats
 *
 * Returns:   <eslOK> on success, and <null2> has been filled in.
 *            This routine has no other return path.
 *
 * Note:      NOTE(review): if <zstart..zend> contains no emitting
 *            position, <Ld> stays 0 and the usage vectors are scaled by
 *            1/0. Callers presumably never pass such a segment; confirm.
 */
int
p7_GNull2_ByTrace(const P7_PROFILE *gm, const P7_TRACE *tr, int zstart, int zend, P7_GMX *wrk, float *null2)
{
  float **dp   = wrk->dp;	/* so that {MDI}MX() macros work; do not rename */
  float  *xmx  = wrk->xmx;	/* so that XMX() macro works; do not rename     */
  int     Ld   = 0;		/* number of emitting positions in the segment  */
  int     M    = gm->M;
  int     k;			/* index over model position     */
  int     x;			/* index over residues           */
  int     z;			/* index over trace position     */
  float   xfactor;		/* summed usage of N, C, and J   */

  /* We'll use the i=0 row in wrk for working space: dp[0][] and xmx[0..4]. */
  esl_vec_FSet(wrk->dp[0], (M+1)*p7G_NSCELLS, 0.0);
  esl_vec_FSet(wrk->xmx,   p7G_NXCELLS,       0.0);

  /* Calculate emitting state usage in this particular trace segment.
   * The <z > 0> test keeps the look-back at tr->st[z-1] inside the
   * trace when the segment starts at element 0.
   */
  for (z = zstart; z <= zend; z++)
    {
      switch (tr->st[z]) {
      case p7T_M:  Ld++; MMX(0,tr->k[z]) += 1.0; break;
      case p7T_I:  Ld++; IMX(0,tr->k[z]) += 1.0; break;
      case p7T_N:  if (z > 0 && tr->st[z-1] == p7T_N) { Ld++; XMX(0,p7G_N) += 1.0; } break;
      case p7T_C:  if (z > 0 && tr->st[z-1] == p7T_C) { Ld++; XMX(0,p7G_C) += 1.0; } break;
      case p7T_J:  if (z > 0 && tr->st[z-1] == p7T_J) { Ld++; XMX(0,p7G_J) += 1.0; } break;
      default:     break;	/* all other trace states contribute no usage */
      }
    }

  /* Convert counts to usage frequencies. */
  esl_vec_FScale(wrk->dp[0], (M+1)*p7G_NSCELLS, (1.0 / (float) Ld));
  esl_vec_FScale(wrk->xmx,   p7G_NXCELLS,       (1.0 / (float) Ld));

  /* Calculate null2's odds ratio emission probabilities, by taking
   * posterior weighted sum over all emission vectors used in paths
   * explaining the domain.
   */
  esl_vec_FSet(null2, gm->abc->K, 0.0);
  xfactor = XMX(0,p7G_N) + XMX(0,p7G_C) + XMX(0,p7G_J);
  for (x = 0; x < gm->abc->K; x++)
    {
      for (k = 1; k < M; k++)
	{
	  null2[x] += MMX(0,k) * expf(p7P_MSC(gm, k, x));
	  null2[x] += IMX(0,k) * expf(p7P_ISC(gm, k, x));
	}
      /* node M contributes its match state only; no insert term is added for it */
      null2[x] += MMX(0,M) * expf(p7P_MSC(gm, M, x));
      /* N/C/J usage is added as-is, i.e. with an odds ratio of 1 */
      null2[x] += xfactor;
    }
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} odds ratios for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;	     /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;	     /* missing data "~" */

  return eslOK;
}
/* Function: CPlan9SWConfig()
 * EPN 05.30.06
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 *
 * Purpose:  Put the alignment-independent parameters of a CM Plan 9
 *           model into hmmsw (Smith/Waterman) configuration: local
 *           begins and local ends are switched on, and the log-odds
 *           scores are rebuilt with CP9Logoddsify().
 *
 * Notes:    The goal for the begin/end probabilities is that every
 *           fragment ij (entering at match i, leaving at match j) is
 *           equiprobable, so the choice of entry/exit carries no
 *           information. There are M(M+1)/2 choices of ij, so each
 *           should get probability 2/M(M+1). That probability is the
 *           product of a begin, an end, and all the not-end
 *           probabilities on the path between i and j.
 *
 *           Entry and exit are asymmetric because of the left/right
 *           nature of the HMM/profile. Entry is simple: each of the
 *           M-1 internal match states gets p_x = pentry / (M-1).
 *           The same recipe does not give a flat distribution over
 *           exit points, because an exit probability must be corrected
 *           for the chance that the model was already exited earlier.
 *           Requiring a flat exit distribution gives:
 *               p_1 = pexit / (M-1)
 *               p_x = p_1 / (1 - (x-1) p_1)
 *
 *           Modified EPN, Thu Feb 7 15:54:16 2008:
 *           To better match a locally configured CM, when
 *           <do_match_local_cm> is set we disallow insertions before
 *           the first (emitting) match state (from I_0) and after the
 *           final (emitting) match state (from I_M). I_0 maps to
 *           ROOT_IL and I_M maps to ROOT_IR, which can never be entered
 *           in a locally configured CM (because the ROOT_S state must
 *           jump into a local begin state, and those are always match
 *           states). The M_0->D_1 transition is disallowed too when it
 *           would be impossible in a locally configured CM.
 *
 *           <do_match_local_cm> is usually TRUE, unless the CP9 is
 *           being configured for eventual sub CM alignment, where the
 *           aim is only to find the most likely start/end point of the
 *           alignment with this CP9 (then I_0 and I_M should stay
 *           reachable).
 *
 * Args:     hmm               - the CM Plan 9 model w/ data-dep prob's valid
 *           pentry            - probability of an internal entry somewhere;
 *                               spread evenly over M-1 match states
 *           pexit             - probability of an internal exit somewhere;
 *                               spread over M-1 match states
 *           do_match_local_cm - TRUE to make I_0, D_1 and I_M unreachable,
 *                               to better match a locally configured CM
 *           first_cm_ndtype   - only used if do_match_local_cm is TRUE:
 *                               MATL or MATP makes D_1 unreachable (as in the CM);
 *                               MATR or MATP makes D_M unreachable (as in the CM)
 *
 * Return:   (void)
 *           HMM probabilities are modified.
 */
void
CPlan9SWConfig(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
{
  float  *t_first = hmm->t[0];	/* transitions out of node 0       */
  float  *t_last;		/* transitions out of node M       */
  float   basep;		/* p1 for exits: the base p        */
  float   d;			/* renormalization denominator     */
  double  begin_mass;		/* mass left for begin[] at node 0 */
  int     k;			/* counter over nodes              */

  /* No special (*x* states in Plan 7) states in CM Plan 9 */

  /* Configure entry. */
  if (do_match_local_cm)
    {
      const int is_matp = (first_cm_ndtype == MATP_nd);

      t_last = hmm->t[hmm->M];

      /* Node 0: no I_0; CTMM and CTMEL were already 0.0 (M_0 -> M_1 is
       * begin[1], and a local end from M_0 is never possible).
       */
      t_first[CTMI]  = 0.;
      t_first[CTMM]  = 0.;
      t_first[CTMEL] = 0.;
      /* CM can't reach the delete state that maps to D_1: make D_1 unreachable too */
      if (is_matp || first_cm_ndtype == MATL_nd) t_first[CTMD] = 0.;

      /* Node M: no way into I_M, from M_M or from D_M. */
      t_last[CTMI] = 0.;
      t_last[CTDI] = 0.;
      /* CM can't reach the delete state that maps to D_M: make D_M unreachable too */
      if (is_matp || first_cm_ndtype == MATR_nd) t_last[CTMD] = 0.;

      /* renormalize transitions out of M_M (together with end[M]) */
      d = esl_vec_FSum(t_last, cp9_TRANS_NMATCH) + hmm->end[hmm->M];
      esl_vec_FScale(t_last, cp9_TRANS_NMATCH, 1./d);
      hmm->end[hmm->M] /= d;

      /* renormalize transitions out of D_M */
      esl_vec_FNorm(t_last + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);
    }

  /* Whatever node 0 does not spend on CTMI/CTMD/CTMEL is split between
   * begin[1] and the internal entries begin[2..M].
   * Note: t[0][CTMEL] == 0. (can't locally end from begin), and if
   * do_match_local_cm, t[0][CTMI] and t[0][CTMD] were just set to 0.
   */
  begin_mass    = 1. - (t_first[CTMI] + t_first[CTMD] + t_first[CTMEL]);
  hmm->begin[1] = (1. - pentry) * begin_mass;
  esl_vec_FSet(hmm->begin+2, hmm->M-1, (pentry * begin_mass) / (float)(hmm->M-1));

  /* Configure exit.
   * Don't touch hmm->end[hmm->M]
   */
  basep = pexit / (float) (hmm->M-1);
  k = 1;
  while (k < hmm->M)
    {
      hmm->end[k] = basep / (1. - basep * (float) (k-1));
      k++;
    }
  CPlan9RenormalizeExits(hmm, 1);

  hmm->flags &= ~CPLAN9_HASBITS;				/* reconfig invalidates log-odds scores */
  hmm->flags |= (CPLAN9_LOCAL_BEGIN | CPLAN9_LOCAL_END);	/* local begins and ends now on         */

  CP9Logoddsify(hmm);
}
/* Function: CPlan9Renormalize()
 *
 * Purpose:  Take an HMM in counts form and renormalize every one of
 *           its probability vectors: match and insert emissions, the
 *           begin distribution, the per-node transition vectors, and
 *           the null model. CM Plan 9 restrictions on nonexistent
 *           transitions are enforced along the way (M_0 emissions and
 *           the node 0 delete transitions are zeroed).
 *
 * Args:     hmm - the model to renormalize.
 *
 * Return:   (void)
 *           hmm is changed; CPLAN9_HASBITS is cleared and
 *           CPLAN9_HASPROB is set in hmm->flags.
 */
void
CPlan9Renormalize(CP9_t *hmm)
{
  float *trans;		/* transition vector of the node being processed */
  float  denom;		/* normalization denominator                     */
  int    node;		/* counter for model position                    */

  /* Match emissions: M_0 is the B state, a non-emitter. */
  esl_vec_FSet(hmm->mat[0], hmm->abc->K, 0.);
  for (node = 1; node <= hmm->M; node++)
    {
      esl_vec_FNorm(hmm->mat[node], hmm->abc->K);
    }

  /* Insert emissions, including node 0. */
  for (node = 0; node <= hmm->M; node++)
    {
      esl_vec_FNorm(hmm->ins[node], hmm->abc->K);
    }

  /* Begin transitions share one distribution with the M_0 transitions
   * to I, D and EL. t[0][CTMEL] should always be 0. (can't local end
   * from the M_0 == B state).
   */
  trans = hmm->t[0];
  denom = esl_vec_FSum(hmm->begin+1, hmm->M) + trans[CTMI] + trans[CTMD] + trans[CTMEL];
  esl_vec_FScale(hmm->begin+1, hmm->M, 1./denom);
  trans[CTMI]  /= denom;
  trans[CTMD]  /= denom;
  trans[CTMEL] /= denom;

  /* Node 0 (state N): normalize the insert transitions, zero the delete ones. */
  esl_vec_FNorm(trans + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);
  esl_vec_FSet (trans + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE, 0.);

  /* Main model transitions. Safe for node M too: t[M][CTMM] should be 0. */
  for (node = 1; node <= hmm->M; node++)
    {
      trans = hmm->t[node];

      /* match transitions are normalized together with end[node] */
      denom = esl_vec_FSum(trans, cp9_TRANS_NMATCH) + hmm->end[node];
      esl_vec_FScale(trans, cp9_TRANS_NMATCH, 1./denom);
      hmm->end[node] /= denom;

      esl_vec_FNorm(trans + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);	/* insert */
      esl_vec_FNorm(trans + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);	/* delete */
    }

  /* Null model emissions. */
  esl_vec_FNorm(hmm->null, hmm->abc->K);

  hmm->flags &= ~CPLAN9_HASBITS;	/* clear the log-odds ready flag */
  hmm->flags |= CPLAN9_HASPROB;	/* set the probabilities OK flag */
}
/* Function:  p7_masstrace_FinishCount()
 * Synopsis:  Convert counted histograms to cumulative endpoint prob distributions.
 *
 * Purpose:   Endpoint collection with <_CountTrace()> into <mt> is
 *            finished, and <ntr> of the counted traces contained the
 *            specified domain anchor. Turn the raw counts into <mt>'s
 *            cumulative probability distributions: counts are summed
 *            toward the anchor from both sides, divided by <ntr>, and
 *            the anchor cell itself is set to 1.
 *
 *            The sequence-coordinate array <mt->imass> is optional and
 *            is skipped when it is NULL; <mt->kmass> is always processed.
 *
 * Args:      mt  - mass trace object we've collected endpoint counts in
 *            ntr - number of traces we counted into <mt> that contained the domain anchor
 *
 * Returns:   <eslOK> on success; <mt> is now a valid <P7_MASSTRACE> object
 *            containing envelope endpoint cumulative probability distributions.
 */
int
p7_masstrace_FinishCount(P7_MASSTRACE *mt, int ntr)
{
  const double per_trace = 1./(float) ntr;	/* counts -> probabilities        */
  float       *mass;				/* array currently being cumulated */
  int          pos;

  ESL_DASSERT1( (ntr > 0)  );
  ESL_DASSERT1( (mt->i0)   );
  ESL_DASSERT1( (mt->k0)   );
  ESL_DASSERT1( (p7_trace_IsMain(mt->st0)) );

  /* Sequence positions 1..L, anchored at i0. */
  if (mt->imass)
    {
      mass = mt->imass;

      pos = 1;
      while (pos < mt->i0)	/* left of the anchor: running sum, left to right */
	{
	  mass[pos] += mass[pos-1];
	  pos++;
	}

      pos = mt->L;
      while (pos > mt->i0)	/* right of the anchor: running sum, right to left */
	{
	  mass[pos] += mass[pos+1];
	  pos--;
	}

      esl_vec_FScale(mass+1, mt->L, per_trace);
      mass[mt->i0] = 1.;
    }

  /* Model positions 1..M, anchored at k0. */
  mass = mt->kmass;

  pos = 1;
  while (pos < mt->k0)
    {
      mass[pos] += mass[pos-1];
      pos++;
    }

  pos = mt->M;
  while (pos > mt->k0)
    {
      mass[pos] += mass[pos+1];
      pos--;
    }

  esl_vec_FScale(mass+1, mt->M, per_trace);
  mass[mt->k0] = 1.;

  return eslOK;
}
/* Function: CPlan9RenormalizeExits()
 * EPN 05.30.06 based on SRE's Plan7RenormalizeExits() from
 *              HMMER's plan7.c.
 *
 * Date:     SRE, Fri Aug 14 11:22:19 1998 [St. Louis]
 *
 * Purpose:  Renormalize just the match state transitions, so that
 *           at each node they sum to (1 - end[k]); for instance,
 *           after a Config() function has modified the exit
 *           distribution.
 *
 * Args:     hmm  - hmm to renormalize
 *           spos - first consensus column modelled by original
 *                  CP9 HMM the sub CP9 HMM models. Often 1.
 *
 * Returns:  void
 */
void
CPlan9RenormalizeExits(CP9_t *hmm, int spos)
{
  float  *t_last;	/* transitions out of node M             */
  float   d;		/* current sum of the transitions        */
  double  rescale;	/* factor that brings d to 1 - end[M]    */
  int     k;

  /* We can't exit from node 0 so we start renormalizing at node 1.
   * The literal 4 is the length of the vector summed and scaled here
   * (presumably the match transitions; confirm against the CP9
   * transition layout).
   */
  for (k = 1; k < hmm->M; k++)
    {
      if (k == (spos-1)) continue;	/* we can't exit from M_spos-1 */

      d = esl_vec_FSum(hmm->t[k], 4);
      esl_vec_FScale(hmm->t[k], 4, (1.-hmm->end[k])/d);
    }

  /* Node M is special: CTMD is IMPOSSIBLE and CTMM is hmm->end[hmm->M],
   * so only CTMI and CTMEL are rescaled.
   */
  t_last = hmm->t[hmm->M];
  d      = t_last[CTMI] + t_last[CTMEL];
  if (fabs(d-0.) < eslSMALLX1) return;	/* don't divide by d if it's zero */

  rescale        = (1.-hmm->end[hmm->M])/d;
  t_last[CTMI]  *= rescale;
  t_last[CTMEL] *= rescale;
  return;
}
static void utest_pvectors(void) { char *msg = "pvector unit test failed"; double p1[4] = { 0.25, 0.25, 0.25, 0.25 }; double p2[4]; double p3[4]; float p1f[4]; float p2f[4] = { 0.0, 0.5, 0.5, 0.0 }; float p3f[4]; int n = 4; double result; esl_vec_D2F(p1, n, p1f); esl_vec_F2D(p2f, n, p2); if (esl_vec_DValidate(p1, n, 1e-12, NULL) != eslOK) esl_fatal(msg); if (esl_vec_FValidate(p1f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p1, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p1f, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p2, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p2f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p2, p1, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FRelEntropy(p2f, p1f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p1, p2, n); if (result != eslINFINITY) esl_fatal(msg); result = esl_vec_FRelEntropy(p1f, p2f, n); if (result != eslINFINITY) esl_fatal(msg); esl_vec_DLog(p2, n); if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg); esl_vec_DExp(p2, n); if (p2[0] != 0.) esl_fatal(msg); esl_vec_FLog(p2f, n); if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); esl_vec_FExp(p2f, n); if (p2f[0] != 0.) 
esl_fatal(msg); esl_vec_DCopy(p2, n, p3); esl_vec_DScale(p3, n, 10.); esl_vec_DNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DLog(p3, n); result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DIncrement(p3, n, 2.0); esl_vec_DLogNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_FCopy(p2f, n, p3f); esl_vec_FScale(p3f, n, 10.); esl_vec_FNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FLog(p3f, n); result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FIncrement(p3f, n, 2.0); esl_vec_FLogNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); return; }
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; float *sqd; int status; int nbad; int nali = 0; int nbadali = 0; int nwgt = 0; int nbadwgt = 0; int i; int be_quiet; int do_gsc; int do_pb; int do_blosum; double maxid; double tol; int maxN; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } be_quiet = esl_opt_GetBoolean(go, "-q"); do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); tol = esl_opt_GetReal (go, "--tol"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } msafile = esl_opt_GetArg(go, 1); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } nali++; nwgt += msa->nseq; ESL_ALLOC(sqd, sizeof(float) * msa->nseq); if (do_gsc) { esl_msaweight_GSC(msa); GSCWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_pb) { esl_msaweight_PB(msa); PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_blosum) { esl_msaweight_BLOSUM(msa, maxid); BlosumWeights(msa->aseq, msa->nseq, msa->alen, maxid, sqd); /* workaround SQUID bug: BLOSUM 
weights weren't renormalized to sum to nseq. */ esl_vec_FNorm (sqd, msa->nseq); esl_vec_FScale(sqd, msa->nseq, (float) msa->nseq); } if (! be_quiet) { for (i = 0; i < msa->nseq; i++) fprintf(stdout, "%-20s %.3f %.3f\n", msa->sqname[i], msa->wgt[i], sqd[i]); } nbad = 0; for (i = 0; i < msa->nseq; i++) if (esl_DCompare((double) sqd[i], msa->wgt[i], tol) != eslOK) nbad++; if (nbad > 0) nbadali++; nbadwgt += nbad; if (nbad > 0) printf("%-20s :: alignment shows %d weights that differ (out of %d) \n", msa->name, nbad, msa->nseq); esl_msa_Destroy(msa); free(sqd); } eslx_msafile_Close(afp); if (nbadali == 0) printf("OK: all weights identical between squid and Easel in %d alignment(s)\n", nali); else { printf("%d of %d weights mismatched at (> %f fractional difference)\n", nbadwgt, nwgt, tol); printf("involving %d of %d total alignments\n", nbadali, nali); } return eslOK; ERROR: return status; }