/* select_d()  [SSE version]
 *
 * One stochastic traceback step out of D(i,k). The previous state is
 * either M(i,k-1) or D(i,k-1); each candidate is weighted by its DP cell
 * value times its transition into D(i,k), the two weights are normalized,
 * and one is sampled using <rng>.
 *
 * Returns: p7T_M or p7T_D.
 */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  const int    nvec  = p7O_NQF(ox->M);
  const int    qcur  = (k-1) % nvec;            /* striped vector holding D(i,k)            */
  const int    lane  = (k-1) / nvec;            /* element of that vector holding D(i,k)    */
  const int    wrap  = (qcur <= 0);             /* predecessor lives in the last vector     */
  const int    qprv  = wrap ? nvec-1 : qcur-1;  /* striped vector holding the k-1 cells     */
  const __m128 zerov = _mm_setzero_ps();
  const int    prev_state[2] = { p7T_M, p7T_D };
  __m128       mpv, dpv;
  __m128       tmdv, tddv;
  union { __m128 v; float p[4]; } prod;
  float        weight[2];

  mpv  = ox->dpf[i][qprv*3 + p7X_M];
  dpv  = ox->dpf[i][qprv*3 + p7X_D];
  tmdv = om->tfv[7*qprv + p7O_MD];
  tddv = om->tfv[7*nvec + qprv];

  /* When wrapping around from vector 0 to the last vector, the k-1 values
   * sit one element lower, so shift them into alignment with <lane>.
   */
  if (wrap)
    {
      mpv  = esl_sse_rightshift_ps(mpv,  zerov);
      dpv  = esl_sse_rightshift_ps(dpv,  zerov);
      tmdv = esl_sse_rightshift_ps(tmdv, zerov);
      tddv = esl_sse_rightshift_ps(tddv, zerov);
    }

  prod.v = _mm_mul_ps(mpv, tmdv);  weight[0] = prod.p[lane];
  prod.v = _mm_mul_ps(dpv, tddv);  weight[1] = prod.p[lane];

  esl_vec_FNorm(weight, 2);
  return prev_state[esl_rnd_FChoose(rng, weight, 2)];
}
/* select_d()  [VMX/Altivec version]
 *
 * One stochastic traceback step out of D(i,k). The previous state is
 * either M(i,k-1) or D(i,k-1); each candidate is weighted by its DP cell
 * value times its transition into D(i,k), the two weights are normalized,
 * and one is sampled using <rng>.
 *
 * Returns: p7T_M or p7T_D.
 */
static inline int
select_d(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  const int    nvec = p7O_NQF(ox->M);
  const int    qcur = (k-1) % nvec;            /* striped vector holding D(i,k)         */
  const int    lane = (k-1) / nvec;            /* element of that vector holding D(i,k) */
  const int    wrap = (qcur <= 0);             /* predecessor lives in the last vector  */
  const int    qprv = wrap ? nvec-1 : qcur-1;  /* striped vector holding the k-1 cells  */
  const int    prev_state[2] = { p7T_M, p7T_D };
  vector float zerov;
  vector float mpv, dpv;
  vector float tmdv, tddv;
  union { vector float v; float p[4]; } prod;
  float        weight[2];

  zerov = (vector float) vec_splat_u32(0);

  mpv  = ox->dpf[i][qprv*3 + p7X_M];
  dpv  = ox->dpf[i][qprv*3 + p7X_D];
  tmdv = om->tfv[7*qprv + p7O_MD];
  tddv = om->tfv[7*nvec + qprv];

  /* When wrapping around from vector 0 to the last vector, the k-1 values
   * sit one element lower, so shift them into alignment with <lane>.
   */
  if (wrap)
    {
      mpv  = vec_sld(zerov, mpv,  12);
      dpv  = vec_sld(zerov, dpv,  12);
      tmdv = vec_sld(zerov, tmdv, 12);
      tddv = vec_sld(zerov, tddv, 12);
    }

  prod.v = vec_madd(mpv, tmdv, zerov);  weight[0] = prod.p[lane];
  prod.v = vec_madd(dpv, tddv, zerov);  weight[1] = prod.p[lane];

  esl_vec_FNorm(weight, 2);
  return prev_state[esl_rnd_FChoose(rng, weight, 2)];
}
/* select_m()  [SSE version]
 *
 * One stochastic traceback step out of M(i,k). Candidates are B(i-1),
 * M(i-1,k-1), I(i-1,k-1) and D(i-1,k-1); each is weighted by its DP value
 * times its transition into M(i,k), the weights are normalized, and one
 * is sampled using <rng>.
 *
 * Returns: p7T_B, p7T_M, p7T_I, or p7T_D.
 */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  const int     nvec  = p7O_NQF(ox->M);
  const int     qcur  = (k-1) % nvec;            /* striped vector holding M(i,k)         */
  const int     lane  = (k-1) / nvec;            /* element of that vector holding M(i,k) */
  const int     wrap  = (qcur <= 0);             /* predecessor lives in the last vector  */
  const int     qprv  = wrap ? nvec-1 : qcur-1;  /* striped vector holding the k-1 cells  */
  const __m128 *tsc   = om->tfv + 7*qcur;        /* four consecutive transitions into M(i,k),
                                                  * paired in order with B, M, I, D below */
  const __m128  zerov = _mm_setzero_ps();
  const int     prev_state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };
  __m128        src[4];
  union { __m128 v; float p[4]; } prod;
  float         weight[4];
  int           j;

  src[0] = _mm_set1_ps(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  src[1] = ox->dpf[i-1][qprv*3 + p7X_M];
  src[2] = ox->dpf[i-1][qprv*3 + p7X_I];
  src[3] = ox->dpf[i-1][qprv*3 + p7X_D];

  /* Wrapping from vector 0 back to the last vector: shift the three
   * main-model vectors by one element (the broadcast B value is untouched).
   */
  if (wrap)
    for (j = 1; j < 4; j++)
      src[j] = esl_sse_rightshift_ps(src[j], zerov);

  for (j = 0; j < 4; j++)
    {
      prod.v    = _mm_mul_ps(src[j], tsc[j]);
      weight[j] = prod.p[lane];
    }

  esl_vec_FNorm(weight, 4);
  return prev_state[esl_rnd_FChoose(rng, weight, 4)];
}
/* select_m()  [VMX/Altivec version]
 *
 * One stochastic traceback step out of M(i,k). Candidates are B(i-1),
 * M(i-1,k-1), I(i-1,k-1) and D(i-1,k-1); each is weighted by its DP value
 * times its transition into M(i,k), the weights are normalized, and one
 * is sampled using <rng>.
 *
 * Returns: p7T_B, p7T_M, p7T_I, or p7T_D.
 */
static inline int
select_m(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  const int     nvec = p7O_NQF(ox->M);
  const int     qcur = (k-1) % nvec;            /* striped vector holding M(i,k)         */
  const int     lane = (k-1) / nvec;            /* element of that vector holding M(i,k) */
  const int     wrap = (qcur <= 0);             /* predecessor lives in the last vector  */
  const int     qprv = wrap ? nvec-1 : qcur-1;  /* striped vector holding the k-1 cells  */
  vector float *tsc  = om->tfv + 7*qcur;        /* four consecutive transitions into M(i,k),
                                                 * paired in order with B, M, I, D below */
  const int     prev_state[4] = { p7T_B, p7T_M, p7T_I, p7T_D };
  vector float  zerov;
  vector float  src[4];
  union { vector float v; float p[4]; } prod;
  float         weight[4];
  int           j;

  zerov  = (vector float) vec_splat_u32(0);

  src[0] = esl_vmx_set_float(ox->xmx[(i-1)*p7X_NXCELLS+p7X_B]);
  src[1] = ox->dpf[i-1][qprv*3 + p7X_M];
  src[2] = ox->dpf[i-1][qprv*3 + p7X_I];
  src[3] = ox->dpf[i-1][qprv*3 + p7X_D];

  /* Wrapping from vector 0 back to the last vector: shift the three
   * main-model vectors by one element (the broadcast B value is untouched).
   */
  if (wrap)
    for (j = 1; j < 4; j++)
      src[j] = vec_sld(zerov, src[j], 12);

  for (j = 0; j < 4; j++)
    {
      prod.v    = vec_madd(src[j], tsc[j], zerov);
      weight[j] = prod.p[lane];
    }

  esl_vec_FNorm(weight, 4);
  return prev_state[esl_rnd_FChoose(rng, weight, 4)];
}
/* Function:  p7_ParameterEstimation()
 * Incept:    SRE, Sat Mar 24 10:15:37 2007 [Janelia]
 *
 * Purpose:   Given an <hmm> containing collected, weighted counts;
 *            and given a mixture Dirichlet prior <pri>;
 *            calculate mean posterior parameter estimates for
 *            all model parameters, converting the
 *            HMM to a parameterized probabilistic model.
 *
 *            Each count vector is widened to double, run through
 *            <esl_mixdchlet_MPParameters()> with the matching prior
 *            component, and the result is narrowed back in place.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri)
{
  const int M = hmm->M;
  const int K = hmm->abc->K;
  double    counts[p7_MAXABET];     /* count vector, as doubles            */
  double    probs[p7_MAXABET];      /* mean posterior estimate             */
  double    mixq[p7_MAXDCHLET];     /* scratch for mixture coefficients    */
  float    *tk;                     /* transition vector of current node   */
  int       node;

  /* Match transitions, nodes 0..M (node 0 is the B state).
   * At node M there is no next D state, so MD is forced to zero and the
   * three match transitions are renormalized.
   */
  for (node = 0; node <= M; node++)
    {
      tk = hmm->t[node];
      esl_vec_F2D(tk, 3, counts);
      esl_mixdchlet_MPParameters(counts, 3, pri->tm, mixq, probs);
      esl_vec_D2F(probs, 3, tk);
    }
  hmm->t[M][p7H_MD] = 0.0;
  esl_vec_FNorm(hmm->t[M], 3);

  /* Insert transitions, nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      tk = hmm->t[node] + 3;
      esl_vec_F2D(tk, 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->ti, mixq, probs);
      esl_vec_D2F(probs, 2, tk);
    }

  /* Delete transitions, nodes 1..M-1 only.
   * Node 0 is unused and node M must exit to E, so both are set by
   * convention to DM = 1.0, DD = 0.0.
   */
  for (node = 1; node < M; node++)
    {
      tk = hmm->t[node] + 5;
      esl_vec_F2D(tk, 2, counts);
      esl_mixdchlet_MPParameters(counts, 2, pri->td, mixq, probs);
      esl_vec_D2F(probs, 2, tk);
    }
  hmm->t[0][p7H_DM] = hmm->t[M][p7H_DM] = 1.0;
  hmm->t[0][p7H_DD] = hmm->t[M][p7H_DD] = 0.0;

  /* Match emissions, nodes 1..M.
   * mat[0] is set by convention to a valid pvector: first element 1, rest 0.
   */
  for (node = 1; node <= M; node++)
    {
      esl_vec_F2D(hmm->mat[node], K, counts);
      esl_mixdchlet_MPParameters(counts, K, pri->em, mixq, probs);
      esl_vec_D2F(probs, K, hmm->mat[node]);
    }
  esl_vec_FSet(hmm->mat[0], K, 0.);
  hmm->mat[0][0] = 1.0;

  /* Insert emissions, nodes 0..M. */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->ins[node], K, counts);
      esl_mixdchlet_MPParameters(counts, K, pri->ei, mixq, probs);
      esl_vec_D2F(probs, K, hmm->ins[node]);
    }

  return eslOK;
}
/* Function: CPlan9SWConfig()
 * EPN 05.30.06
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 *
 * Purpose:  Set the alignment independent parameters of
 *           a CM Plan 9 model to hmmsw (Smith/Waterman) configuration.
 *
 * Notes:    The desideratum for begin/end probs is that all fragments ij
 *           (starting at match i, ending at match j) are
 *           equiprobable -- there is no information in the choice of
 *           entry/exit. There are M(M+1)/2 possible choices of ij, so
 *           each must get a probability of 2/M(M+1). This prob is the
 *           product of a begin, an end, and all the not-end probs in
 *           the path between i,j.
 *
 *           Entry/exit is asymmetric because of the left/right
 *           nature of the HMM/profile. Entry probability is distributed
 *           simply by assigning p_x = pentry / (M-1) to M-1
 *           internal match states. However, the same approach doesn't
 *           lead to a flat distribution over exit points. Exit p's
 *           must be corrected for the probability of a previous exit
 *           from the model. Requiring a flat distribution over exit
 *           points leads to an easily solved piece of algebra, giving:
 *                 p_1 = pexit / (M-1)
 *                 p_x = p_1 / (1 - (x-1) p_1)
 *
 *           Modified EPN, Thu Feb  7 15:54:16 2008, as follows:
 *           To better match a locally configured CM, if <do_match_local_cm>
 *           we disallow insertions before the first (emitting) match state
 *           (from I_0), and after the final (emitting) match state
 *           (from I_M). I_0 maps to ROOT_IL and I_M maps to ROOT_IR,
 *           which can never be entered in a locally configured CM
 *           (b/c the ROOT_S state MUST jump into a local begin state, which
 *           are always match states). We also disallow a M_0->D_1 transition
 *           because it would be impossible in a locally configured CM.
 *
 *           <do_match_local_cm> is usually TRUE, unless we're configuring
 *           the CP9 specifically for eventual sub CM alignment, where
 *           the goal is simply to find the most likely start/end point
 *           of the alignment with this CP9 (in that case we want
 *           I_0 and I_M reachable).
 *
 * Args:     hmm    - the CM Plan 9 model w/ data-dep prob's valid
 *           pentry - probability of an internal entry somewhere;
 *                    will be evenly distributed over M-1 match states
 *           pexit  - probability of an internal exit somewhere;
 *                    will be distributed over M-1 match states.
 *           do_match_local_cm - TRUE to make I_0, D_1 and I_M unreachable
 *                    to better match a locally configured CM.
 *           first_cm_ndtype - only used if do_match_local_cm is TRUE
 *                    if it's MATL or MATP then D_1 should be unreachable (it is in the CM)
 *                    if it's MATR or MATP then D_M should be unreachable (it is in the CM)
 *
 * Return:   (void)
 *           HMM probabilities are modified.
 */
void
CPlan9SWConfig(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
{
  float  p1;        /* p1 for exits: the base p                         */
  float  denom;     /* renormalization denominator                      */
  float  reserved;  /* mass leaving M_0 other than through begin[]      */
  double avail;     /* 1 - reserved: mass left to spread over begin[]   */
  float *t0;        /* transitions out of node 0                        */
  float *tM;        /* transitions out of node M                        */
  int    x;         /* counter over nodes                               */

  /* No special (*x* states in Plan 7) states in CM Plan 9 */

  /* Configure entry. */
  if (do_match_local_cm)
    {
      t0 = hmm->t[0];
      tM = hmm->t[hmm->M];

      t0[CTMI]  = 0.;
      t0[CTMM]  = 0.;  /* already was 0.0, transition from M_0 to M_1 is begin[1] */
      t0[CTMEL] = 0.;  /* already was 0.0, can never do a local end from M_0 */
      if ((first_cm_ndtype == MATL_nd) || (first_cm_ndtype == MATP_nd))
        t0[CTMD] = 0.; /* CM can't reach the delete state that maps to D_1; make D_1 unreachable too */

      tM[CTMI] = 0.;
      tM[CTDI] = 0.;
      if ((first_cm_ndtype == MATR_nd) || (first_cm_ndtype == MATP_nd))
        tM[CTMD] = 0.; /* CM can't reach the delete state that maps to D_M; make D_M unreachable too */

      /* renormalize transitions out of M_M */
      denom = esl_vec_FSum(tM, cp9_TRANS_NMATCH) + hmm->end[hmm->M];
      esl_vec_FScale(tM, cp9_TRANS_NMATCH, 1./denom);
      hmm->end[hmm->M] /= denom;

      /* renormalize transitions out of D_M */
      esl_vec_FNorm(tM + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);
    }

  /* note: hmm->t[0][CTMEL] == 0. (can't locally end from begin)
   * and if do_match_local_cm, hmm->t[0][CTMI] and hmm->t[0][CTMD] were just set to 0.
   */
  reserved = hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL];
  avail    = 1. - reserved;
  hmm->begin[1] = (1. - pentry) * avail;
  esl_vec_FSet(hmm->begin+2, hmm->M-1, (pentry * avail) / (float)(hmm->M-1));

  /* Configure exit.
   * Don't touch hmm->end[hmm->M]
   */
  p1 = pexit / (float) (hmm->M-1);
  for (x = 1; x < hmm->M; x++)
    hmm->end[x] = p1 / (1. - p1 * (float) (x-1));
  CPlan9RenormalizeExits(hmm, 1);

  hmm->flags &= ~CPLAN9_HASBITS;     /* reconfig invalidates log-odds scores */
  hmm->flags |= CPLAN9_LOCAL_BEGIN;  /* local begins now on */
  hmm->flags |= CPLAN9_LOCAL_END;    /* local ends now on */

  CP9Logoddsify(hmm);
}
/* esl_vec_FLogNorm()
 *
 * In-place conversion of <vec> (length <n>): subtract the vector's
 * esl_vec_FLogSum() from every element, exponentiate each element, then
 * normalize the result with esl_vec_FNorm().
 */
void
esl_vec_FLogNorm(float *vec, int n)
{
  const float logtotal = esl_vec_FLogSum(vec, n);

  esl_vec_FIncrement(vec, n, -1.*logtotal);
  esl_vec_FExp(vec, n);
  esl_vec_FNorm(vec, n);
}
/* select_j()
 *
 * One stochastic traceback step out of J(i): the previous state is either
 * J(i-1) (via the J loop transition) or E(i) (via the E loop transition,
 * additionally multiplied by row i's scale factor). The two weights are
 * normalized and one is sampled using <rng>.
 *
 * Returns: p7T_J or p7T_E.
 */
static inline int
select_j(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i)
{
  const int prev_state[2] = { p7T_J, p7T_E };
  const int prv = (i-1) * p7X_NXCELLS;   /* start of special cells for row i-1 */
  const int cur = i     * p7X_NXCELLS;   /* start of special cells for row i   */
  float     weight[2];

  weight[0] = ox->xmx[prv+p7X_J] * om->xf[p7O_J][p7O_LOOP];
  weight[1] = ox->xmx[cur+p7X_E] * om->xf[p7O_E][p7O_LOOP] * ox->xmx[cur+p7X_SCALE];

  esl_vec_FNorm(weight, 2);
  return prev_state[esl_rnd_FChoose(rng, weight, 2)];
}
/* select_b()
 *
 * One stochastic traceback step out of B(i): the previous state is either
 * N(i) or J(i), each weighted by its row-i value times its MOVE
 * transition. The two weights are normalized and one is sampled using <rng>.
 *
 * Returns: p7T_N or p7T_J.
 */
static inline int
select_b(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i)
{
  const int prev_state[2] = { p7T_N, p7T_J };
  const int cur = i * p7X_NXCELLS;   /* start of special cells for row i */
  float     weight[2];

  weight[0] = ox->xmx[cur+p7X_N] * om->xf[p7O_N][p7O_MOVE];
  weight[1] = ox->xmx[cur+p7X_J] * om->xf[p7O_J][p7O_MOVE];

  esl_vec_FNorm(weight, 2);
  return prev_state[esl_rnd_FChoose(rng, weight, 2)];
}
/* esl_hmm_PosteriorDecoding()
 *
 * For each row i = 1..L, fill <pp> with the elementwise product of the
 * <fwd> and <bck> matrices over the hmm->M states, then normalize that
 * row with esl_vec_FNorm().
 *
 * <dsq> is accepted but not read here.
 *
 * Returns: <eslOK>.
 */
int
esl_hmm_PosteriorDecoding(const ESL_DSQ *dsq, int L, const ESL_HMM *hmm, ESL_HMX *fwd, ESL_HMX *bck, ESL_HMX *pp)
{
  int row;
  int st;

  for (row = 1; row <= L; row++)
    {
      st = 0;
      while (st < hmm->M)
        {
          pp->dp[row][st] = fwd->dp[row][st] * bck->dp[row][st];
          st++;
        }
      esl_vec_FNorm(pp->dp[row], hmm->M);
    }
  return eslOK;
}
/* select_i()  [SSE version]
 *
 * One stochastic traceback step out of I(i,k). The previous state is
 * either M(i-1,k) or I(i-1,k); each is weighted by its DP value times its
 * transition into I(i,k), the weights are normalized, and one is sampled
 * using <rng>.
 *
 * Returns: p7T_M or p7T_I.
 */
static inline int
select_i(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  const int     nvec = p7O_NQF(ox->M);
  const int     qcur = (k-1) % nvec;   /* striped vector holding I(i,k)         */
  const int     lane = (k-1) / nvec;   /* element of that vector holding I(i,k) */
  const __m128 *tsc  = om->tfv + 7*qcur + p7O_MI;  /* two consecutive transition vectors,
                                                    * paired in order with M, I below */
  const int     prev_state[2] = { p7T_M, p7T_I };
  __m128        src[2];
  union { __m128 v; float p[4]; } prod;
  float         weight[2];
  int           j;

  src[0] = ox->dpf[i-1][qcur*3 + p7X_M];
  src[1] = ox->dpf[i-1][qcur*3 + p7X_I];

  for (j = 0; j < 2; j++)
    {
      prod.v    = _mm_mul_ps(src[j], tsc[j]);
      weight[j] = prod.p[lane];
    }

  esl_vec_FNorm(weight, 2);
  return prev_state[esl_rnd_FChoose(rng, weight, 2)];
}
/* select_i()  [VMX/Altivec version]
 *
 * One stochastic traceback step out of I(i,k). The previous state is
 * either M(i-1,k) or I(i-1,k); each is weighted by its DP value times its
 * transition into I(i,k), the weights are normalized, and one is sampled
 * using <rng>.
 *
 * Returns: p7T_M or p7T_I.
 */
static inline int
select_i(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int k)
{
  const int     nvec = p7O_NQF(ox->M);
  const int     qcur = (k-1) % nvec;   /* striped vector holding I(i,k)         */
  const int     lane = (k-1) / nvec;   /* element of that vector holding I(i,k) */
  vector float *tsc  = om->tfv + 7*qcur + p7O_MI;  /* two consecutive transition vectors,
                                                    * paired in order with M, I below */
  const int     prev_state[2] = { p7T_M, p7T_I };
  vector float  zerov;
  vector float  src[2];
  union { vector float v; float p[4]; } prod;
  float         weight[2];
  int           j;

  zerov  = (vector float) vec_splat_u32(0);
  src[0] = ox->dpf[i-1][qcur*3 + p7X_M];
  src[1] = ox->dpf[i-1][qcur*3 + p7X_I];

  for (j = 0; j < 2; j++)
    {
      prod.v    = vec_madd(src[j], tsc[j], zerov);
      weight[j] = prod.p[lane];
    }

  esl_vec_FNorm(weight, 2);
  return prev_state[esl_rnd_FChoose(rng, weight, 2)];
}
/* Function: CPlan9Renormalize()
 *
 * Purpose:  Take an HMM in counts form, and renormalize
 *           all of its probability vectors. Also enforces
 *           CM Plan9 restrictions on nonexistent transitions.
 *
 * Args:     hmm - the model to renormalize.
 *
 * Return:   (void)
 *           hmm is changed: on return CPLAN9_HASBITS is cleared and
 *           CPLAN9_HASPROB is set in hmm->flags.
 */
void
CPlan9Renormalize(CP9_t *hmm)
{
  const int M = hmm->M;
  const int K = hmm->abc->K;
  float    *tn;      /* transition vector of the node being processed */
  float     total;   /* renormalization denominator                   */
  int       node;    /* counter for model position                    */

  /* Emissions: M_0 is the B state, a non-emitter, so mat[0] is zeroed. */
  esl_vec_FSet(hmm->mat[0], K, 0.);
  for (node = 1; node <= M; node++) esl_vec_FNorm(hmm->mat[node], K);
  for (node = 0; node <= M; node++) esl_vec_FNorm(hmm->ins[node], K);

  /* Node 0: the begin[] entries share one distribution with the
   * M_0 -> I_0, D_1, EL transitions.
   * hmm->t[0][CTMEL] should always be 0. (can't local end from M_0 == B).
   */
  tn    = hmm->t[0];
  total = esl_vec_FSum(hmm->begin+1, M) + tn[CTMI] + tn[CTMD] + tn[CTMEL];
  esl_vec_FScale(hmm->begin+1, M, 1./total);
  tn[CTMI]  /= total;
  tn[CTMD]  /= total;
  tn[CTMEL] /= total;

  esl_vec_FNorm(tn + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);      /* out of insert, node 0 (state N) */
  esl_vec_FSet (tn + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE, 0.);  /* no delete state at node 0       */

  /* Main model, nodes 1..M. Match transitions share a distribution with
   * end[node]. Safe for node M too: hmm->t[M][CTMM] should be 0.
   */
  for (node = 1; node <= M; node++)
    {
      tn    = hmm->t[node];
      total = esl_vec_FSum(tn, cp9_TRANS_NMATCH) + hmm->end[node];
      esl_vec_FScale(tn, cp9_TRANS_NMATCH, 1./total);
      hmm->end[node] /= total;

      esl_vec_FNorm(tn + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);  /* insert */
      esl_vec_FNorm(tn + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);  /* delete */
    }

  /* null model emissions */
  esl_vec_FNorm(hmm->null, K);

  hmm->flags &= ~CPLAN9_HASBITS;  /* clear the log-odds ready flag   */
  hmm->flags |= CPLAN9_HASPROB;   /* set the probabilities OK flag   */
}
/* Function: p7_null3_score() * * Purpose: Calculate a correction (in log_2 odds) to be applied * to a sequence, using a null model based on the * composition of the target sequence. * The null model is constructed /post hoc/ as the * distribution of the target sequence; if the target * sequence is 40% A, 5% C, 5% G, 40% T, then the null * model is (0.4, 0.05, 0.05, 0.4). This function is * based heavily on Infernal's ScoreCorrectionNull3(), * with two important changes: * - it leaves the log2 conversion from NATS to BITS * for the calling function. * - it doesn't include the omega score modifier * (based on prior probability of using the null3 * model), again leaving this to the calling function. * * Args: abc - alphabet for hit (only used to get alphabet size) * dsq - the sequence the hit resides in * tr - trace of the alignment, used to find the match states * (non-match chars are ignored in computing freq, not used if NULL) * start - start position of hit in dsq * stop - end position of hit in dsq * bg - background, used for the default null model's emission freq * ret_sc - RETURN: the correction to the score (in NATS); * caller subtracts this from hit score to get * corrected score. * Return: void, ret_sc: the log-odds score correction (in NATS). */ void p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc) { float score = 0.; int status; int i; float *freq; int dir; int tr_pos; ESL_ALLOC(freq, sizeof(float) * abc->K); esl_vec_FSet(freq, abc->K, 0.0); /* contract check */ if(abc == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", ""); if(dsq == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", ""); if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", ""); dir = start < stop ? 
1 : -1; if (tr != NULL) { /* skip the parts of the trace that precede the first match state */ tr_pos = 2; i = start; while (tr->st[tr_pos] != p7T_M) { if (tr->st[tr_pos] == p7T_N) i += dir; tr_pos++; } /* tally frequencies from characters hitting match state*/ while (tr->st[tr_pos] != p7T_E) { if (tr->st[tr_pos] == p7T_M) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } if (tr->st[tr_pos] != p7T_D ) i += dir; tr_pos++; } } else { /* tally frequencies from the full envelope */ for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } } esl_vec_FNorm(freq, abc->K); /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/ for (i = 0; i < abc->K; i++) score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ; /* Return the correction to the bit score. */ score = p7_FLogsum(0., score); *ret_sc = score; return; ERROR: esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", ""); return; /* never reached */ }
static void utest_pvectors(void) { char *msg = "pvector unit test failed"; double p1[4] = { 0.25, 0.25, 0.25, 0.25 }; double p2[4]; double p3[4]; float p1f[4]; float p2f[4] = { 0.0, 0.5, 0.5, 0.0 }; float p3f[4]; int n = 4; double result; esl_vec_D2F(p1, n, p1f); esl_vec_F2D(p2f, n, p2); if (esl_vec_DValidate(p1, n, 1e-12, NULL) != eslOK) esl_fatal(msg); if (esl_vec_FValidate(p1f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p1, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p1f, n); if (esl_DCompare(2.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DEntropy(p2, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FEntropy(p2f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p2, p1, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_FRelEntropy(p2f, p1f, n); if (esl_DCompare(1.0, result, 1e-9) != eslOK) esl_fatal(msg); result = esl_vec_DRelEntropy(p1, p2, n); if (result != eslINFINITY) esl_fatal(msg); result = esl_vec_FRelEntropy(p1f, p2f, n); if (result != eslINFINITY) esl_fatal(msg); esl_vec_DLog(p2, n); if (esl_vec_DLogValidate(p2, n, 1e-12, NULL) != eslOK) esl_fatal(msg); esl_vec_DExp(p2, n); if (p2[0] != 0.) esl_fatal(msg); esl_vec_FLog(p2f, n); if (esl_vec_FLogValidate(p2f, n, 1e-7, NULL) != eslOK) esl_fatal(msg); esl_vec_FExp(p2f, n); if (p2f[0] != 0.) 
esl_fatal(msg); esl_vec_DCopy(p2, n, p3); esl_vec_DScale(p3, n, 10.); esl_vec_DNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DLog(p3, n); result = esl_vec_DLogSum(p3, n); if (esl_DCompare(0.0, result, 1e-12) != eslOK) esl_fatal(msg); esl_vec_DIncrement(p3, n, 2.0); esl_vec_DLogNorm(p3, n); if (esl_vec_DCompare(p2, p3, n, 1e-12) != eslOK) esl_fatal(msg); esl_vec_FCopy(p2f, n, p3f); esl_vec_FScale(p3f, n, 10.); esl_vec_FNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FLog(p3f, n); result = esl_vec_FLogSum(p3f, n); if (esl_DCompare(0.0, result, 1e-7) != eslOK) esl_fatal(msg); esl_vec_FIncrement(p3f, n, 2.0); esl_vec_FLogNorm(p3f, n); if (esl_vec_FCompare(p2f, p3f, n, 1e-7) != eslOK) esl_fatal(msg); return; }
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the
 *            <P7_BG> object <bg>.
 *
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 *            The file's first token names the alphabet (and must match
 *            <bg>'s alphabet); each following line is a canonical residue
 *            letter followed by its frequency. '#' starts a comment.
 *            Every canonical residue must appear exactly once, each
 *            frequency must be nonnegative, and the frequencies must sum
 *            to 1.0 within 0.001; they are renormalized before being
 *            copied into <bg>.
 *
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 *
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;
  int             n     = 0;
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status = esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);

  /* -1.0 marks "not yet seen"; it is what duplicate detection keys on. */
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* residue letter */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if (toklen != 1 || ! esl_abc_CIsCanonical(bg->abc, *tok))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)
        ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      /* its frequency */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok)) ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);

      /* A negative value is never a valid frequency, and a literal -1.0
       * would be indistinguishable from the "not yet seen" marker, letting
       * a repeated residue slip past the duplicate check above.
       */
      if (fq[x] < 0.0)
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a nonnegative probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      /* nothing else is allowed on the line */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K)
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK)
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);

  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  if (fq)  free(fq);
  if (efp) esl_fileparser_Close(efp);
  return status;
}
int main(int argc, char **argv) { ESL_GETOPTS *go; char *msafile; ESLX_MSAFILE *afp; ESL_MSA *msa; float *sqd; int status; int nbad; int nali = 0; int nbadali = 0; int nwgt = 0; int nbadwgt = 0; int i; int be_quiet; int do_gsc; int do_pb; int do_blosum; double maxid; double tol; int maxN; /* Process command line */ go = esl_getopts_Create(options); if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_VerifyConfig(go) != eslOK) esl_fatal("failed to parse cmd line: %s\n", go->errbuf); if (esl_opt_GetBoolean(go, "-h") == TRUE) { puts(usage); puts("\n where options are:"); esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=all docgroups; 2=indentation; 80=width */ return 0; } be_quiet = esl_opt_GetBoolean(go, "-q"); do_blosum = esl_opt_GetBoolean(go, "--blosum"); do_gsc = esl_opt_GetBoolean(go, "--gsc"); do_pb = esl_opt_GetBoolean(go, "--pb"); maxid = esl_opt_GetReal (go, "--id"); tol = esl_opt_GetReal (go, "--tol"); maxN = esl_opt_GetInteger(go, "--maxN"); if (esl_opt_ArgNumber(go) != 1) { puts("Incorrect number of command line arguments."); puts(usage); return 1; } msafile = esl_opt_GetArg(go, 1); esl_getopts_Destroy(go); /* Weight one or more alignments from input file */ if ((status = eslx_msafile_Open(NULL, msafile, NULL, eslMSAFILE_UNKNOWN, NULL, &afp)) != eslOK) eslx_msafile_OpenFailure(afp, status); while ( (status = eslx_msafile_Read(afp, &msa)) != eslEOF) { if (status != eslOK) eslx_msafile_ReadFailure(afp, status); if (maxN > 0 && msa->nseq > maxN) { esl_msa_Destroy(msa); continue; } nali++; nwgt += msa->nseq; ESL_ALLOC(sqd, sizeof(float) * msa->nseq); if (do_gsc) { esl_msaweight_GSC(msa); GSCWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_pb) { esl_msaweight_PB(msa); PositionBasedWeights(msa->aseq, msa->nseq, msa->alen, sqd); } else if (do_blosum) { esl_msaweight_BLOSUM(msa, maxid); BlosumWeights(msa->aseq, msa->nseq, msa->alen, maxid, sqd); /* workaround SQUID bug: BLOSUM 
weights weren't renormalized to sum to nseq. */ esl_vec_FNorm (sqd, msa->nseq); esl_vec_FScale(sqd, msa->nseq, (float) msa->nseq); } if (! be_quiet) { for (i = 0; i < msa->nseq; i++) fprintf(stdout, "%-20s %.3f %.3f\n", msa->sqname[i], msa->wgt[i], sqd[i]); } nbad = 0; for (i = 0; i < msa->nseq; i++) if (esl_DCompare((double) sqd[i], msa->wgt[i], tol) != eslOK) nbad++; if (nbad > 0) nbadali++; nbadwgt += nbad; if (nbad > 0) printf("%-20s :: alignment shows %d weights that differ (out of %d) \n", msa->name, nbad, msa->nseq); esl_msa_Destroy(msa); free(sqd); } eslx_msafile_Close(afp); if (nbadali == 0) printf("OK: all weights identical between squid and Easel in %d alignment(s)\n", nali); else { printf("%d of %d weights mismatched at (> %f fractional difference)\n", nbadwgt, nwgt, tol); printf("involving %d of %d total alignments\n", nbadali, nali); } return eslOK; ERROR: return status; }
/**
 * int main(int argc, char **argv)
 * Main driver
 *
 * Reads each HMM from <input hmmfile>, replaces the transition vector of
 * every internal node (1..M-1) with the model-wide average of those
 * vectors (match, insert and delete transitions are each renormalized),
 * validates the result, writes it to <output hmmfile>, and prints one
 * line of summary statistics per model to stdout.
 *
 * Every failure (unopenable file, invalid model, write error) is reported
 * through p7_Fail() or esl_fatal() rather than by a bare return, so an
 * error is never silent.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go         = NULL;	/* command line processing */
  ESL_ALPHABET    *abc        = NULL;
  char            *hmmfile    = NULL;
  char            *outhmmfile = NULL;
  P7_HMMFILE      *hfp        = NULL;
  FILE            *outhmmfp   = NULL;	/* HMM output file handle  */
  P7_HMM          *hmm        = NULL;
  P7_BG           *bg         = NULL;
  int              nhmm;
  double           x;
  float            KL;
  int              status;
  char             errbuf[eslERRBUFSIZE];
  float            average_internal_transitions[ p7H_NTRANSITIONS ];
  int              k;
  char             errmsg[eslERRBUFSIZE];

  /* Process the command line options. */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE)
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2)
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL)
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL)
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);

  /* Initializations: open the input HMM file for reading */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  /* Initializations: open the output HMM file for writing.
   * <status> is eslOK at this point, so returning it would look like
   * success; fail loudly instead.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL)
    p7_Fail("Failed to open HMM file %s for writing\n", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx", "name", "accession", "nseq", "eff_nseq", "M", "relent", "info", "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s", hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets", hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s", hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Sum the transition vectors of the internal nodes 1..M-1 ... */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for (k = 1; k < hmm->M; k++)
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);

      /* ... and renormalize each of the three distributions separately. */
      esl_vec_FNorm(average_internal_transitions,     3);  /* match transitions  */
      esl_vec_FNorm(average_internal_transitions + 3, 2);  /* insert transitions */
      esl_vec_FNorm(average_internal_transitions + 5, 2);  /* delete transitions */

      /* Ok now set them. */
      for (k = 1; k < hmm->M; k++)
        esl_vec_FCopy(average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k]);

      if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001)) != eslOK)
        p7_Fail("HMM %s failed validation after averaging transitions.\n%s\n", hmm->name, errmsg);
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK)
        p7_Fail("HMM save failed\n");

      p7_MeanPositionRelativeEntropy(hmm, bg, &x);
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
             nhmm,
             hmm->name,
             hmm->acc == NULL ? "-" : hmm->acc,
             hmm->nseq,
             hmm->eff_nseq,
             hmm->M,
             p7_MeanMatchRelativeEntropy(hmm, bg),
             p7_MeanMatchInfo(hmm, bg),
             x,
             KL);

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);

  /* fclose() flushes buffered output; a failure here means the file on
   * disk may be incomplete.
   */
  if (fclose(outhmmfp) != 0)
    p7_Fail("Failed to close HMM file %s after writing\n", outhmmfile);

  esl_getopts_Destroy(go);
  exit(0);
}