/* Function:  p7_GNull2_ByTrace()
 * Synopsis:  Assign null2 scores to an envelope by the sampling method.
 * Incept:    SRE, Thu May  1 10:00:43 2008 [Janelia]
 *
 * Purpose:   Given a traceback <tr> for an alignment of model <gm> to
 *            some target sequence; calculate null2 odds ratios
 *            $\frac{f'(x)}{f(x)}$ as the state-usage-weighted emission
 *            probabilities, with state usages calculated by counting
 *            emissions used at positions <zstart..zend> in the trace.
 *
 *            Because we only need to collect state usages from the
 *            trace <tr>, the target sequence is irrelevant. Because
 *            we are only averaging emission odds ratios from model
 *            <gm>, the configuration of <gm> is irrelevant (uni
 *            vs. multihit, or length config).
 *
 *            If the segment <zstart..zend> contains no emitting
 *            position at all, there are no state usages to average
 *            over. Rather than dividing by zero (which would fill
 *            <null2> with NaNs), every entry of <null2> is then set
 *            to the neutral odds ratio 1.0.
 *
 * Args:      gm     - model, in any configuration; only emission odds are used
 *            tr     - traceback for any region (or all) of a target sequence
 *            zstart - first elem in <tr> to collect from; use 0 for complete
 *            zend   - last elem in <tr> to collect from; use tr->N-1 for complete
 *            wrk    - DP matrix w/ at least one row, for workspace
 *            null2  - RESULT: odds ratios f'(x)/f(x) for all Kp residues
 *
 * Returns:   <eslOK> on success, and <null2[0..Kp-1]> is filled in.
 */
int
p7_GNull2_ByTrace(const P7_PROFILE *gm, const P7_TRACE *tr, int zstart, int zend, P7_GMX *wrk, float *null2)
{
  float **dp  = wrk->dp;	/* so that {MDI}MX() macros work */
  float  *xmx = wrk->xmx;	/* so that XMX() macro works     */
  int     Ld  = 0;		/* number of emitting trace positions counted */
  int     M   = gm->M;
  int     k;			/* index over model position     */
  int     x;			/* index over residues           */
  int     z;			/* index over trace position     */
  float   xfactor;

  /* We'll use the i=0 row in wrk for working space: dp[0][] and xmx[0..4]. */
  esl_vec_FSet(wrk->dp[0], (M+1)*p7G_NSCELLS, 0.0);
  esl_vec_FSet(wrk->xmx,   p7G_NXCELLS,       0.0);

  /* Calculate emitting state usage in this particular trace segment.
   * An N, C, or J element only counts as an emission when the previous
   * trace element is the same state; the <z > 0> guard keeps that
   * look-back inside the trace array.
   */
  for (z = zstart; z <= zend; z++)
    {
      switch (tr->st[z]) {
      case p7T_M:  Ld++; MMX(0,tr->k[z]) += 1.0; break;
      case p7T_I:  Ld++; IMX(0,tr->k[z]) += 1.0; break;
      case p7T_N:  if (z > 0 && tr->st[z-1] == p7T_N) { Ld++; XMX(0,p7G_N) += 1.0; } break;
      case p7T_C:  if (z > 0 && tr->st[z-1] == p7T_C) { Ld++; XMX(0,p7G_C) += 1.0; } break;
      case p7T_J:  if (z > 0 && tr->st[z-1] == p7T_J) { Ld++; XMX(0,p7G_J) += 1.0; } break;
      default:     break;	/* other trace states are not counted */
      }
    }

  /* No emissions in the segment: nothing to normalize by. Report neutral odds. */
  if (Ld == 0)
    {
      esl_vec_FSet(null2, gm->abc->Kp, 1.0);
      return eslOK;
    }

  /* Convert counts to usage frequencies. */
  esl_vec_FScale(wrk->dp[0], (M+1)*p7G_NSCELLS, (1.0 / (float) Ld));
  esl_vec_FScale(wrk->xmx,   p7G_NXCELLS,       (1.0 / (float) Ld));

  /* Calculate null2's odds ratio emission probabilities, by taking
   * posterior weighted sum over all emission vectors used in paths
   * explaining the domain. N, C, and J usage is added unweighted
   * (as <xfactor>), i.e. with an implied odds ratio of 1.
   */
  esl_vec_FSet(null2, gm->abc->K, 0.0);
  xfactor = XMX(0,p7G_N) + XMX(0,p7G_C) + XMX(0,p7G_J);
  for (x = 0; x < gm->abc->K; x++)
    {
      for (k = 1; k < M; k++)
	{
	  null2[x] += MMX(0,k) * expf(p7P_MSC(gm, k, x));
	  null2[x] += IMX(0,k) * expf(p7P_ISC(gm, k, x));
	}
      null2[x] += MMX(0,M) * expf(p7P_MSC(gm, M, x));	/* node M: match term only */
      null2[x] += xfactor;
    }
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} odds ratios for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;	/* gap character    */
  null2[gm->abc->Kp-2] = 1.0;	/* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;	/* missing data "~" */

  return eslOK;
}
/* Function:  p7_masstrace_Zero()
 * Synopsis:  Initialize cumulative endpoint distributions to zeros.
 *
 * Purpose:   Reset the mass trace object <mt> so it is ready to collect
 *            endpoint data for a sequence of length <L> and a profile
 *            of length <M>. The first <M+2> entries of <mt->kmass> are
 *            zeroed; if <mt->imass> is present, its first <L+2> entries
 *            are zeroed too. <mt->M> and <mt->L> are recorded.
 *
 * Args:      mt - mass trace object to collect endpoint data in
 *            M  - profile length
 *            L  - sequence length
 *
 * Returns:   <eslOK> on success.
 */
int
p7_masstrace_Zero(P7_MASSTRACE *mt, int M, int L)
{
  /* contract checks: allocations must be large enough for M+2 / L+2 cells */
  ESL_DASSERT1( (mt->imass == NULL || L+2 <= mt->ialloc ) );
  ESL_DASSERT1( (M+2 <= mt->kalloc) );

  mt->M = M;
  mt->L = L;

  esl_vec_FSet(mt->kmass, M+2, 0.0f);
  if (mt->imass != NULL)	/* <imass> is optional */
    {
      esl_vec_FSet(mt->imass, L+2, 0.0f);
    }

  return eslOK;
}
/* utest_msv()
 *
 * The MSV score can be validated against Viterbi (provided we trust
 * Viterbi), by creating a multihit local profile in which:
 *   1. All t_MM scores = 0
 *   2. All other core transitions = -inf
 *   3. All t_BMk entries uniformly log 2/(M(M+1))
 *
 * For each of <nseq> random iid sequences of length <L>, the MSV score
 * computed on <gm> must agree with the Viterbi score computed on the
 * modified clone, to within 0.0001; any disagreement or failed call is
 * fatal.
 */
static void
utest_msv(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, P7_PROFILE *gm, int nseq, int L)
{
  P7_PROFILE *g2     = NULL;	/* clone of <gm>, rescored to mimic MSV */
  ESL_DSQ    *dsq    = NULL;	/* sampled digital sequence             */
  P7_GMX     *gx     = NULL;	/* DP matrix shared by both algorithms  */
  float       msv_sc;
  float       vit_sc;
  int         node;
  int         n;

  dsq = malloc(sizeof(ESL_DSQ) *(L+2));
  if (dsq == NULL) esl_fatal("malloc failed");

  gx = p7_gmx_Create(gm->M, L);
  if (gx == NULL) esl_fatal("matrix creation failed");

  g2 = p7_profile_Clone(gm);
  if (g2 == NULL) esl_fatal("profile clone failed");

  /* Make g2's scores appropriate for simulating the MSV algorithm in Viterbi */
  esl_vec_FSet(g2->tsc, p7P_NTRANS * g2->M, -eslINFINITY);
  for (node = 1; node < g2->M; node++)
    {
      p7P_TSC(g2, node, p7P_MM) = 0.0f;
    }
  for (node = 0; node < g2->M; node++)
    {
      p7P_TSC(g2, node, p7P_BM) = log(2.0f / ((float) g2->M * (float) (g2->M+1)));
    }

  n = 0;
  while (n < nseq)
    {
      if (esl_rsq_xfIID(r, bg->f, abc->K, L, dsq) != eslOK) esl_fatal("seq generation failed");
      if (p7_GMSV    (dsq, L, gm, gx, 2.0, &msv_sc)   != eslOK) esl_fatal("MSV failed");
      if (p7_GViterbi(dsq, L, g2, gx,      &vit_sc)   != eslOK) esl_fatal("viterbi failed");
      if (fabs(msv_sc-vit_sc) > 0.0001) esl_fatal("MSV score not equal to Viterbi score");
      n++;
    }

  p7_gmx_Destroy(gx);
  p7_profile_Destroy(g2);
  free(dsq);
  return;
}
/* Function: p7_bg_Create() * Incept: SRE, Fri Jan 12 13:32:51 2007 [Janelia] * * Purpose: Allocate a <P7_BG> object for digital alphabet <abc>, * initializes it to appropriate default values, and * returns a pointer to it. * * For protein models, default iid background frequencies * are set (by <p7_AminoFrequencies()>) to average * SwissProt residue composition. For DNA, RNA and other * alphabets, default frequencies are set to a uniform * distribution. * * The model composition <bg->mcomp[]> is not initialized * here; neither is the filter null model <bg->fhmm>. To * use the filter null model, caller will want to * initialize these fields by calling * <p7_bg_SetFilterByHMM()>. * * Throws: <NULL> on allocation failure. * * Xref: STL11/125. */ P7_BG * p7_bg_Create(const ESL_ALPHABET *abc) { P7_BG *bg = NULL; int status; ESL_ALLOC(bg, sizeof(P7_BG)); bg->f = NULL; bg->fhmm = NULL; ESL_ALLOC(bg->f, sizeof(float) * abc->K); if ((bg->fhmm = esl_hmm_Create(abc, 2)) == NULL) goto ERROR; if (abc->type == eslAMINO) { if (p7_AminoFrequencies(bg->f) != eslOK) goto ERROR; } else esl_vec_FSet(bg->f, abc->K, 1. / (float) abc->K); bg->p1 = 350./351.; bg->omega = 1./256.; bg->abc = abc; return bg; ERROR: p7_bg_Destroy(bg); return NULL; }
/* utest_ReadWrite()
 *
 * Round-trip test for background model file I/O: sample random
 * frequencies into a <P7_BG>, write it to a temporary file, wipe the
 * in-memory frequencies, read the file back, and require the result
 * to match the sampled frequencies to a tolerance of 0.01. Any
 * failure is fatal.
 */
static void
utest_ReadWrite(ESL_RANDOMNESS *rng)
{
  char          msg[]       = "bg Read/Write unit test failed";
  char          tmpfile[32] = "esltmpXXXXXX";
  FILE         *fp          = NULL;
  ESL_ALPHABET *abc         = NULL;   /* random alphabet choice eslRNA..eslDICE */
  float        *fq          = NULL;   /* the sampled reference frequencies      */
  P7_BG        *bg          = NULL;

  abc = esl_alphabet_Create(esl_rnd_Roll(rng, 5) + 1);
  if (abc == NULL) esl_fatal(msg);

  bg = p7_bg_Create(abc);
  if (bg == NULL) esl_fatal(msg);

  fq = malloc(sizeof(float) * abc->K);
  if (fq == NULL) esl_fatal(msg);

  /* Resample until no probability is tiny:
   * small p's will get rounded off and fail FCompare()
   */
  for (;;)
    {
      if (esl_dirichlet_FSampleUniform(rng, abc->K, fq) != eslOK) esl_fatal(msg);
      if (! (esl_vec_FMin(fq, abc->K) < 0.001)) break;
    }
  esl_vec_FCopy(fq, abc->K, bg->f);

  /* write ... */
  if (esl_tmpfile_named(tmpfile, &fp) != eslOK) esl_fatal(msg);
  if (p7_bg_Write(fp, bg)             != eslOK) esl_fatal(msg);
  fclose(fp);

  /* ... forget ... */
  esl_vec_FSet(bg->f, bg->abc->K, 0.0);

  /* ... read back and compare */
  if (p7_bg_Read(tmpfile, bg, NULL)                    != eslOK) esl_fatal(msg);
  if (esl_vec_FCompare(fq, bg->f, bg->abc->K, 0.01)    != eslOK) esl_fatal(msg);

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  free(fq);
  remove(tmpfile);
}
/* Function:  p7_ParameterEstimation()
 * Incept:    SRE, Sat Mar 24 10:15:37 2007 [Janelia]
 *
 * Purpose:   Given an <hmm> containing collected, weighted counts;
 *            and given a mixture Dirichlet prior <pri>;
 *            calculate mean posterior parameter estimates for
 *            all model parameters, converting the
 *            HMM to a parameterized probabilistic model.
 *
 *            Each count vector is converted to double, passed through
 *            <esl_mixdchlet_MPParameters()> with the matching prior
 *            (<tm>, <ti>, <td>, <em>, <ei>), and the result is written
 *            back over the counts as floats.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_ParameterEstimation(P7_HMM *hmm, const P7_PRIOR *pri)
{
  const int M = hmm->M;			/* model length                            */
  const int K = hmm->abc->K;		/* alphabet size                           */
  double    c[p7_MAXABET];		/* counts, as doubles                      */
  double    p[p7_MAXABET];		/* resulting mean posterior estimates      */
  double    mix[p7_MAXDCHLET];		/* workspace for MPParameters()            */
  int       node;

  /* Match transitions 0,1..M: 0 is the B state.
   * The three match transitions occupy t[k][0..2].
   * TMD at node M is 0, so it is zeroed and the vector renormalized.
   */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->t[node], 3, c);
      esl_mixdchlet_MPParameters(c, 3, pri->tm, mix, p);
      esl_vec_D2F(p, 3, hmm->t[node]);
    }
  hmm->t[M][p7H_MD] = 0.0;
  esl_vec_FNorm(hmm->t[M], 3);

  /* Insert transitions, 0..M: the two values at t[k][3..4]. */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->t[node]+3, 2, c);
      esl_mixdchlet_MPParameters(c, 2, pri->ti, mix, p);
      esl_vec_D2F(p, 2, hmm->t[node]+3);
    }

  /* Delete transitions, 1..M-1: the two values at t[k][5..6].
   * For k=0, which is unused, and for k=M, the code fixes
   * DM = 1.0 and DD = 0.0 (at k=M there is no next D; must go to E).
   */
  for (node = 1; node < M; node++)
    {
      esl_vec_F2D(hmm->t[node]+5, 2, c);
      esl_mixdchlet_MPParameters(c, 2, pri->td, mix, p);
      esl_vec_D2F(p, 2, hmm->t[node]+5);
    }
  hmm->t[0][p7H_DM] = hmm->t[M][p7H_DM] = 1.0;
  hmm->t[0][p7H_DD] = hmm->t[M][p7H_DD] = 0.0;

  /* Match emissions, 1..M.
   * Convention sets mat[0] to a valid pvector: first elem 1, the rest 0.
   */
  for (node = 1; node <= M; node++)
    {
      esl_vec_F2D(hmm->mat[node], K, c);
      esl_mixdchlet_MPParameters(c, K, pri->em, mix, p);
      esl_vec_D2F(p, K, hmm->mat[node]);
    }
  esl_vec_FSet(hmm->mat[0], K, 0.);
  hmm->mat[0][0] = 1.0;

  /* Insert emissions 0..M */
  for (node = 0; node <= M; node++)
    {
      esl_vec_F2D(hmm->ins[node], K, c);
      esl_mixdchlet_MPParameters(c, K, pri->ei, mix, p);
      esl_vec_D2F(p, K, hmm->ins[node]);
    }

  return eslOK;
}
/* Function: ZeroCPlan9()
 *
 * Purpose:  Zeros the counts/probabilities fields in a model:
 *           transitions t[0..M], insert emissions ins[0..M], match
 *           emissions mat[1..M], and begin[1..M] / end[1..M]. Resets
 *           the EL bookkeeping arrays and the EL self transition, and
 *           clears the HASBITS and HASPROB flags.
 *           Leaves null model untouched.
 */
void
ZeroCPlan9(CP9_t *hmm)
{
  int node;

  /* node 0 has insert emissions and transitions but no match emissions zeroed here */
  esl_vec_FSet(hmm->ins[0], hmm->abc->K, 0.);
  esl_vec_FSet(hmm->t[0],   cp9_NTRANS,  0.);

  node = 1;
  while (node <= hmm->M)
    {
      esl_vec_FSet(hmm->t[node],   cp9_NTRANS,  0.);
      esl_vec_FSet(hmm->mat[node], hmm->abc->K, 0.);
      esl_vec_FSet(hmm->ins[node], hmm->abc->K, 0.);
      node++;
    }

  esl_vec_FSet(hmm->begin+1, hmm->M, 0.);
  esl_vec_FSet(hmm->end+1,   hmm->M, 0.);

  /* initialize the el_* data structures, these
   * depend on the CM guide tree and will be set
   * when the CP9 is constructed from the CM.
   */
  for (node = 0; node <= hmm->M; node++)
    {
      hmm->has_el[node]       = FALSE;
      hmm->el_from_ct[node]   = 0;
      hmm->el_from_idx[node]  = NULL;
      hmm->el_from_cmnd[node] = NULL;
    }

  /* special case hmm->M+1 corresponds to the E state here
   * (has_el[] has no entry set for it)
   */
  hmm->el_from_ct[hmm->M+1]   = 0;
  hmm->el_from_idx[hmm->M+1]  = NULL;
  hmm->el_from_cmnd[hmm->M+1] = NULL;

  hmm->el_self = 0.;			/* EL self transition probability */

  hmm->flags &= ~CPLAN9_HASBITS;	/* invalidates scores        */
  hmm->flags &= ~CPLAN9_HASPROB;	/* invalidates probabilities */
}
/* Function: CPlan9SWConfig()
 * EPN 05.30.06
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 *
 * Purpose:  Set the alignment independent parameters of
 *           a CM Plan 9 model to hmmsw (Smith/Waterman) configuration.
 *
 * Notes:    The desideratum for begin/end probs is that all fragments ij
 *           (starting at match i, ending at match j) are
 *           equiprobable -- there is no information in the choice of
 *           entry/exit. There are M(M+1)/2 possible choices of ij, so
 *           each must get a probability of 2/M(M+1). This prob is the
 *           product of a begin, an end, and all the not-end probs in
 *           the path between i,j.
 *
 *           Entry/exit is asymmetric because of the left/right
 *           nature of the HMM/profile. Entry probability is distributed
 *           simply by assigning p_x = pentry / (M-1) to M-1
 *           internal match states. However, the same approach doesn't
 *           lead to a flat distribution over exit points. Exit p's
 *           must be corrected for the probability of a previous exit
 *           from the model. Requiring a flat distribution over exit
 *           points leads to an easily solved piece of algebra, giving:
 *                      p_1 = pexit / (M-1)
 *                      p_x = p_1 / (1 - (x-1) p_1)
 *
 *           Modified EPN, Thu Feb  7 15:54:16 2008, as follows:
 *           To better match a locally configured CM, if <do_match_local_cm>
 *           we disallow insertions before the first (emitting) match state,
 *           (from I_0), and after the final (emitting) match state,
 *           (from I_M). I_0 maps to ROOT_IL and I_M maps to ROOT_IR
 *           which can never be entered in a locally configured CM
 *           (b/c the ROOT_S state MUST jump into a local begin state, which
 *           are always match states). Also we disallow a M_0->D_1 transition
 *           because these would be impossible in a locally configured CM.
 *
 *           <do_match_local_cm> is usually TRUE, unless we're configuring
 *           the CP9 specifically for eventual sub CM alignment, where
 *           the goal is simply find the most likely start/end point
 *           of the alignment with this CP9 (in that case we want
 *           I_0 and I_M reachable).
 *
 * Args:     hmm    - the CM Plan 9 model w/ data-dep prob's valid
 *           pentry - probability of an internal entry somewhere;
 *                    will be evenly distributed over M-1 match states
 *           pexit  - probability of an internal exit somewhere;
 *                    will be distributed over M-1 match states.
 *           do_match_local_cm - TRUE to make I_0, D_1 and I_M unreachable
 *                    to better match a locally configured CM.
 *           first_cm_ndtype - only used if do_match_local_cm is TRUE
 *                    if it's MATL or MATP then t[0][CTMD] is zeroed
 *                    (D_1 is unreachable in the CM);
 *                    if it's MATR or MATP then t[M][CTMD] is zeroed
 *                    (documented upstream as making D_M unreachable).
 *                    NOTE(review): confirm t[M][CTMD] is the transition
 *                    intended for the D_M case.
 *
 * Return:   (void)
 *           HMM probabilities are modified.
 */
void
CPlan9SWConfig(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
{
  float  basep;			/* p1 for exits: the base p                        */
  int    k;			/* counter over states                             */
  float  d;			/* renormalization denominator                     */
  double begin_mass;		/* prob. mass out of M_0 that goes to local begins */

  /* No special (*x* states in Plan 7) states in CM Plan 9 */

  /* Configure entry. */
  if (do_match_local_cm)
    {
      hmm->t[0][CTMI]  = 0.;
      hmm->t[0][CTMM]  = 0.;	/* already was 0.0, transition from M_0 to M_1 is begin[1] */
      hmm->t[0][CTMEL] = 0.;	/* already was 0.0, can never do a local end from M_0      */
      if (first_cm_ndtype == MATL_nd || first_cm_ndtype == MATP_nd)
	{ /* CM can't possibly reach the CM delete state that maps to D_1, make D_1 unreachable too */
	  hmm->t[0][CTMD] = 0.;
	}

      hmm->t[hmm->M][CTMI] = 0.;
      hmm->t[hmm->M][CTDI] = 0.;
      if (first_cm_ndtype == MATR_nd || first_cm_ndtype == MATP_nd)
	{ /* CM can't possibly reach the CM delete state that maps to D_M, make D_M unreachable too */
	  hmm->t[hmm->M][CTMD] = 0.;
	}

      /* renormalize transitions out of M_M (together with end[M]) */
      d = esl_vec_FSum(hmm->t[hmm->M], cp9_TRANS_NMATCH) + hmm->end[hmm->M];
      esl_vec_FScale(hmm->t[hmm->M], cp9_TRANS_NMATCH, 1./d);
      hmm->end[hmm->M] /= d;

      /* renormalize transitions out of D_M */
      esl_vec_FNorm(hmm->t[hmm->M] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);
    }

  /* Whatever M_0 does not spend on I/D/EL transitions is split between
   * begin[1] (a fraction 1-pentry) and begin[2..M] (pentry, evenly).
   * note: hmm->t[0][CTMEL] == 0. (can't locally end from begin)
   *       and if do_match_local_cm, hmm->t[0][CTMI] and hmm->t[0][CTMD] were just set to 0.
   */
  begin_mass    = 1. - (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]);
  hmm->begin[1] = (1. - pentry) * begin_mass;
  esl_vec_FSet(hmm->begin+2, hmm->M-1, (pentry * begin_mass) / (float)(hmm->M-1));

  /* Configure exit.
   * Don't touch hmm->end[hmm->M]
   */
  basep = pexit / (float) (hmm->M-1);
  k = 1;
  while (k < hmm->M)
    {
      hmm->end[k] = basep / (1. - basep * (float) (k-1));
      k++;
    }
  CPlan9RenormalizeExits(hmm, 1);

  hmm->flags &= ~CPLAN9_HASBITS;	/* reconfig invalidates log-odds scores */
  hmm->flags |= CPLAN9_LOCAL_BEGIN;	/* local begins now on */
  hmm->flags |= CPLAN9_LOCAL_END;	/* local ends now on   */

  CP9Logoddsify(hmm);
}
/* Function: CPlan9Renormalize()
 *
 * Purpose:  Take an HMM in counts form, and renormalize
 *           all of its probability vectors. Also enforces
 *           CM Plan9 restrictions on nonexistent transitions:
 *           mat[0] and the delete transitions of node 0 are zeroed.
 *
 * Args:     hmm - the model to renormalize.
 *
 * Return:   (void)
 *           hmm is changed; HASBITS is cleared and HASPROB is set.
 */
void
CPlan9Renormalize(CP9_t *hmm)
{
  int   node;			/* counter for model position */
  float denom;			/* denominator                */

  /* emissions: M_0 is the B state, a non-emitter, so mat[0] is zeroed;
   * mat[1..M] and ins[0..M] are each normalized
   */
  esl_vec_FSet (hmm->mat[0], hmm->abc->K, 0.);
  esl_vec_FNorm(hmm->ins[0], hmm->abc->K);
  for (node = 1; node <= hmm->M; node++)
    {
      esl_vec_FNorm(hmm->mat[node], hmm->abc->K);
      esl_vec_FNorm(hmm->ins[node], hmm->abc->K);
    }

  /* begin transitions: begin[1..M] share one distribution with the
   * M_0 -> I, D, EL transitions.
   * hmm->t[0][CTMEL] should always be 0., can't local end from the M_0 == B state
   */
  denom  = esl_vec_FSum(hmm->begin+1, hmm->M);
  denom  = denom + hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL];
  esl_vec_FScale(hmm->begin+1, hmm->M, 1./denom);
  hmm->t[0][CTMI]  /= denom;
  hmm->t[0][CTMD]  /= denom;
  hmm->t[0][CTMEL] /= denom;

  /* transitions out of insert for node 0 (state N); node 0 has no delete state */
  esl_vec_FNorm(hmm->t[0] + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);
  esl_vec_FSet (hmm->t[0] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE, 0.);

  /* main model transitions
   * safe for node M too, hmm->t[hmm->M][CTMM] should be 0.
   */
  node = 1;
  while (node <= hmm->M)
    {
      /* match transitions share their distribution with end[k] */
      denom = esl_vec_FSum(hmm->t[node], cp9_TRANS_NMATCH) + hmm->end[node];
      esl_vec_FScale(hmm->t[node], cp9_TRANS_NMATCH, 1./denom);
      hmm->end[node] /= denom;

      esl_vec_FNorm(hmm->t[node] + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);	/* insert */
      esl_vec_FNorm(hmm->t[node] + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);	/* delete */
      node++;
    }

  /* null model emissions */
  esl_vec_FNorm(hmm->null, hmm->abc->K);

  hmm->flags &= ~CPLAN9_HASBITS;	/* clear the log-odds ready flag   */
  hmm->flags |= CPLAN9_HASPROB;		/* set the probabilities OK flag   */
}
/* Function:  p7_profile_SameAsMF()
 * Synopsis:  Set a generic profile's scores to give MSV scores.
 * Incept:    MSF Tue Nov 3, 2009 [Janelia]
 *
 * Purpose:   Set a generic profile's transition scores so that the
 *            normal <dp_generic> DP algorithms mimic <p7_MSVFilter()>:
 *            all t_MM scores = 0; all other core transitions = -inf;
 *            all <t_BMk> entries uniformly <log 2/(M(M+1))>.
 *
 *            This version touches only <gm->tsc>; the optimized
 *            profile <om> is accepted but not read.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_profile_SameAsMF(const P7_OPROFILE *om, P7_PROFILE *gm)
{
  float uniform_entry = log(2.0f / ((float) gm->M * (float) (gm->M+1)));
  int   node;

  /* Transitions: start from "everything impossible" ... */
  esl_vec_FSet(gm->tsc, p7P_NTRANS * gm->M, -eslINFINITY);

  /* ... then open up M->M (nodes 1..M-1) and B->M (nodes 0..M-1) */
  for (node = 0; node < gm->M; node++)
    {
      if (node >= 1) { p7P_TSC(gm, node, p7P_MM) = 0.0f; }
      p7P_TSC(gm, node, p7P_BM) = uniform_entry;
    }

  return eslOK;
}
/* Function: CPlan9CMLocalBeginConfig()
 * Incept:   EPN, Thu Jun 21 15:43:29 2007
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 *
 * Purpose:  Set up a CM Plan 9 HMM to mimic CM local begins as closely
 *           as it can. We can't enforce that a begin/end point are chosen
 *           the same way a CM's are, as the choice of a CM local begin
 *           (in non-truncated CYK mode) defines both a start and end point,
 *           and some start/end combinations are impossible. For the CP9
 *           we allow all possible start/end combos.
 *
 *           The CP9 begin vector is cleared over its whole range
 *           [0..M] and then, for every CM node whose local begin is
 *           not impossible, that begin probability is added to the
 *           CP9 begin entry at the node's leftmost consensus position.
 *           Finally the CP9 is flagged as local begin/end and its
 *           log-odds scores are recomputed.
 *
 * Args:     cm - the CM, must have valid cm->cp9, we'll use
 *                the CM local begin probs to set the cm->cp9s
 *                begin/end probs. The CM must already be in local
 *                begin and local end mode, and the CP9 must not yet
 *                have its local end flag up.
 *
 * Return:   (void)
 *           HMM probabilities are modified.
 *
 * NOTE(review): <emap> from CreateEmitMap() is never released in this
 *           function; confirm ownership and free it with the emit map's
 *           destructor if this function owns it.
 * NOTE(review): CPLAN9_LOCAL_END is checked to be down but
 *           CPLAN9_LOCAL_BEGIN is not; confirm whether that is intended.
 */
void
CPlan9CMLocalBeginConfig(CM_t *cm)
{
  CMEmitMap_t *emap;		/* consensus emit map for the CM */
  int          nd;

  /* Contract checks */
  if(cm->cp9 == NULL)
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, cm->cp9 is NULL.\n");
  if(cm->cp9map == NULL)
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, cm->cp9map is NULL.\n");
  if(!(cm->flags & CMH_CP9))
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CMH_CP9 flag is down.");
  if(!(cm->flags & CMH_LOCAL_BEGIN))
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CMH_LOCAL_BEGIN flag is down.");
  if(!(cm->flags & CMH_LOCAL_END))	/* message now matches the condition being tested */
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CMH_LOCAL_END flag is down.");
  if(cm->cp9->flags & CPLAN9_LOCAL_END)
    cm_Fail("ERROR in CPlan9CMLocalBeginConfig, CP9_LOCAL_END flag is already up.");

  /* Configure entry.
   * To match CM, we enforce the only way out of the B state (M_0)
   * is through a local begin into a match state.
   *
   * begin[] is indexed 0..M, so M+1 entries must be cleared: the
   * accumulation below uses +=, and a stale begin[M] would otherwise
   * leak into the new distribution.
   */
  esl_vec_FSet(cm->cp9->begin, cm->cp9->M + 1, 0.);
  emap = CreateEmitMap(cm);
  for (nd = 1; nd < cm->nodes; nd++)
    {
      if(NOT_IMPOSSIBLE(cm->begin[cm->nodemap[nd]]))
	{
	  /* we do += b/c for lpos of BIFs, there's > 1 way to enter there,
	   * the BIF and the first MATP or MATL of the left child of the BIF
	   */
	  cm->cp9->begin[emap->lpos[nd]] += cm->begin[cm->nodemap[nd]];
	}
    }

  cm->cp9->flags &= ~CPLAN9_HASBITS;	/* reconfig invalidates log-odds scores */
  cm->cp9->flags |= CPLAN9_LOCAL_BEGIN;	/* local begins now on */
  cm->cp9->flags |= CPLAN9_LOCAL_END;	/* local ends now on   */

  CP9Logoddsify(cm->cp9);
}
/* Function: p7_bg_CreateUniform() * Synopsis: Creates background model with uniform freqs. * Incept: SRE, Sat Jun 30 10:25:27 2007 [Janelia] * * Purpose: Creates a background model for alphabet <abc> * with uniform residue frequencies. */ P7_BG * p7_bg_CreateUniform(const ESL_ALPHABET *abc) { P7_BG *bg = NULL; int status; ESL_ALLOC(bg, sizeof(P7_BG)); bg->f = NULL; bg->fhmm = NULL; ESL_ALLOC(bg->f, sizeof(float) * abc->K); if ((bg->fhmm = esl_hmm_Create(abc, 2)) == NULL) goto ERROR; esl_vec_FSet(bg->f, abc->K, 1. / (float) abc->K); bg->p1 = 350./351.; bg->omega = 1./256.; bg->abc = (ESL_ALPHABET *) abc; /* safe: we're just keeping a reference */ return bg; ERROR: p7_bg_Destroy(bg); return NULL; }
/* Function: p7_bg_Create() * Synopsis: Create a <P7_BG> null model object. * * Purpose: Allocate a <P7_BG> object for digital alphabet <abc>, * initializes it to appropriate default values, and * returns a pointer to it. * * For protein models, default iid background frequencies * are set (by <p7_AminoFrequencies()>) to average * Swiss-Prot residue composition. For DNA, RNA and other * alphabets, default frequencies are set to a uniform * distribution. * * The model composition <bg->mcomp[]> is not initialized * here; neither is the filter null model <bg->fhmm>. To * use the filter null model, caller will want to * initialize these fields by calling * <p7_bg_SetFilter()>. * * Throws: <NULL> on allocation failure. * * Xref: STL11/125. */ P7_BG * p7_bg_Create(const ESL_ALPHABET *abc) { P7_BG *bg = NULL; int status; ESL_ALLOC(bg, sizeof(P7_BG)); bg->f = NULL; bg->fhmm = NULL; // this is not hard-coded for alphabet size... ESL_ALLOC(bg->f, sizeof(float) * abc->K); if ((bg->fhmm = esl_hmm_Create(abc, 2)) == NULL) goto ERROR; if (abc->type == eslAMINO) { if (p7_AminoFrequencies(bg->f) != eslOK) goto ERROR; } // adding in background probabilities for music intervals else if (abc->type == eslMUSIC) { if (p7_MusicFrequencies(bg->f) != eslOK) goto ERROR; } else esl_vec_FSet(bg->f, abc->K, 1. / (float) abc->K); bg->p1 = 350./351.; bg->omega = 1./256.; bg->abc = abc; return bg; ERROR: p7_bg_Destroy(bg); return NULL; }
/* Function: p7_null3_score() * * Purpose: Calculate a correction (in log_2 odds) to be applied * to a sequence, using a null model based on the * composition of the target sequence. * The null model is constructed /post hoc/ as the * distribution of the target sequence; if the target * sequence is 40% A, 5% C, 5% G, 40% T, then the null * model is (0.4, 0.05, 0.05, 0.4). This function is * based heavily on Infernal's ScoreCorrectionNull3(), * with two important changes: * - it leaves the log2 conversion from NATS to BITS * for the calling function. * - it doesn't include the omega score modifier * (based on prior probability of using the null3 * model), again leaving this to the calling function. * * Args: abc - alphabet for hit (only used to get alphabet size) * dsq - the sequence the hit resides in * tr - trace of the alignment, used to find the match states * (non-match chars are ignored in computing freq, not used if NULL) * start - start position of hit in dsq * stop - end position of hit in dsq * bg - background, used for the default null model's emission freq * ret_sc - RETURN: the correction to the score (in NATS); * caller subtracts this from hit score to get * corrected score. * Return: void, ret_sc: the log-odds score correction (in NATS). */ void p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc) { float score = 0.; int status; int i; float *freq; int dir; int tr_pos; ESL_ALLOC(freq, sizeof(float) * abc->K); esl_vec_FSet(freq, abc->K, 0.0); /* contract check */ if(abc == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", ""); if(dsq == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", ""); if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", ""); dir = start < stop ? 
1 : -1; if (tr != NULL) { /* skip the parts of the trace that precede the first match state */ tr_pos = 2; i = start; while (tr->st[tr_pos] != p7T_M) { if (tr->st[tr_pos] == p7T_N) i += dir; tr_pos++; } /* tally frequencies from characters hitting match state*/ while (tr->st[tr_pos] != p7T_E) { if (tr->st[tr_pos] == p7T_M) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } if (tr->st[tr_pos] != p7T_D ) i += dir; tr_pos++; } } else { /* tally frequencies from the full envelope */ for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } } esl_vec_FNorm(freq, abc->K); /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/ for (i = 0; i < abc->K; i++) score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ; /* Return the correction to the bit score. */ score = p7_FLogsum(0., score); *ret_sc = score; return; ERROR: esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", ""); return; /* never reached */ }
/* Function: cm_tr_penalties_Create() * Date: EPN, Sat Jan 21 12:03:52 2012 * * Purpose: Allocate and initialize a CM_TR_PENALTIES object. * A CM and its emit map are required to determine * truncation penalty scores. This is annoyingly * complex, see verbose notes within code below. * * Some of the code in this function, specifically * that which calculates the probability of a fragment * aligning at a given node, is checkable, but only * if we disallow truncated begins into insert states. * However, we want to allow truncated begins in reality. * I've left in a flag for ignoring inserts (<ignore_inserts>) * I used in testing this function. Set it to TRUE to * perform the test. * * Returns: Newly allocated CM_TR_PENALTIES object. NULL if out * of memory. */ CM_TR_PENALTIES * cm_tr_penalties_Create(CM_t *cm, int ignore_inserts, char *errbuf) { int status; int v, nd, m, i1, i2; int lpos, rpos; int i; /* variables used for determining ratio of inserts to match at each consensus position */ float *mexpocc = NULL; /* [0..c..clen] probability match state is used to emit at cons posn c */ float *iexpocc = NULL; /* [0..c..clen] probability insert state is used to emit after cons posn c */ double *psi = NULL; /* [0..v..M-1] expected occupancy of state v */ float m_psi, i1_psi, i2_psi; /* temp psi values */ float summed_psi; CM_TR_PENALTIES *trp = NULL; /* variables used for calculating global truncation penalties */ float g_5and3; /* fragment probability if 5' and 3' truncation are allowed */ float g_5or3; /* fragment probability if 5' or 3' truncation are allowed */ /* variables used for calculating local truncation penalties */ float *begin = NULL; /* local begin probabilities 0..v..M-1 */ int subtree_clen; /* consensus length of subtree under this node */ float prv53, prv5, prv3; /* previous node's fragment probability, 5'&3', 5' only, 3'only */ float cur53, cur5, cur3; /* current node's fragment probability, 5'&3', 5' only, 3'only */ int nfrag53, nfrag5, nfrag3; /* 
number of fragments, 5'&3', 5' only, 3'only */ if(cm == NULL || cm->emap == NULL) goto ERROR; ESL_ALLOC(trp, sizeof(CM_TR_PENALTIES)); trp->M = cm->M; trp->ignored_inserts = ignore_inserts; /* Define truncation penalties for each state v. This will be * the score for doing a truncated begin into state v. * * Important note: For this discussion we assume that sequences can * only be truncated at consensus positions, which means we don't * have to worry about truncated begins into inserts. This is an * approximation (also made by Diana and Sean in the 2009 trCYK * paper) that greatly simplifies the explanation of the calculation * of the truncation penalties. The examples in my ELN3 notebook * also use this simplification. However, I need to be able to do * truncated begins into insert states in some cases (some pass/mode * combinations see ELN bottom of p.47). I explain first the * rationale for calculating truncation penalties ignoring inserts * and then I describe how I adapt those penalties to allow * for inserts. * * This is a lengthy comment. I've divided it into 3 sections: * Section 1. Global mode truncation penalties, ignoring inserts. * Section 2. Local mode truncation penalties, ignoring inserts. * Section 3. Adapting truncation penalties to allow for inserts. * ************************************************************** * Section 1. Global mode truncation penalties, ignoring inserts. * * We want the truncation penalty to be the log of the probability * that the particular fragment we're aligning was generated from * the following generative process. The generative process differs * between global and local mode. * * In global mode: * o Sample global parsetree which spans consensus positions 1..clen. * o Randomly choose g and h in range 1..clen, where h >= g and * truncate sequence from g..h. The first residue will either be * an insert before position g, or a match at position g of the * model. 
The final residue will either be an insert after position * h or a match at position h of the model. * * All g,h fragments are equiprobable, so the probability of any * particular fragment is 2 / (clen * (clen+1)). So log_2 of this * value is the truncation penalty for all truncated alignments in * global mode where both 5' and 3' truncation are allowed. * * We store this penalty, per-state in the * g_ptyAA[TRPENALTY_5P_AND_3P][0..v..M-1]. The penalty is * identical for all emitting states. The penalty value for * non-emitters is IMPOSSIBLE because truncated begins are * not allowed into non-emitters. * * If only 5' OR 3' truncation is allowed, we only truncate at g or * h, which menas there's 1/clen possible fragments and log_2 * (1/clen) is our global truncation penalty. * * However, if 5' truncation is allowed we can only do a truncated * begin into states that with a consensus subtree that spans * position clen (since we don't allow a truncation at the 3' end). * Thus any state whose subtree that doesn't span clen gets * an IMPOSSIBLE value for its truncation score in: * g_ptyAA[TRPENALTY_5P_ONLY][0..v..M-1]. * * Likewise, if 3' truncation is allowed we can only do a truncated * begin into states that with a consensus subtree that spans * position 1 (since we don't allow a truncation at the 5' end). * * There's an example of computing all three types of penalties for * a simple CM in ELN 3 p43. * ************************************************************ * Section 2. Local mode truncation penalties, ignoring inserts. * * Generative process that generates fragments in local mode: * o Sample local begin state b with consensus subtree from i..j from * local begin state distribution. * o Randomly choose g and h in range i..j, where h >= g and * truncate sequence from g..h. The first residue will either be * an insert before position g, or a match at position g of the * model. 
The final residue will either be an insert after position * h or a match at position h of the model. * * Unlike in global mode, in local mode all fragments are not * equiprobable since the local begin state distribution can be * anything, and each b allows different sets of fragments to be * generated (because they can only span from i to j). * * The truncation penalty should be the log of the probability of * aligning the current fragment to the model. So we need to know * the probability of generating each possible fragment. * We could calculate probability of any fragment g,h with the * following inefficient algorithm: * * For each start fragment point g, * For each start fragment point h, * For each state v, * If lpos[v] <= g && rpos[v] >= h, then * prob[g][h] += begin[v] * 2. / (st_clen[v] * (st_clen[v]+1)); * * Where lpos[v]/rpos[v] are the left/right consensus positions in * consensus subtree rooted at state v. And st_clen[v] is rpos[v] - * lpos[v] + 1, the consensus length of that subtree. * * This gives us prob[g][h], the probability of generating fragment * g,h. But we want to apply the penalty to a state, not to a * fragment, to avoid needing to know the fragment boundaries g,h * during the DP recursion when applying the penalty. * * To facilitate this, we need to find state t, the state with * smallest subtree that contains g,h. State t is relevant because * it is the state which will root the alignment of the fragment g,h * by using a truncated begin transition into t. This gives a new * algorithm: * * For each start fragment point g, * For each start fragment point h, * Identify state t, the max valued state for which * lpos[v] <= g && rpos[v] >= h, then { * prob[t] += prob[g][h] * fcount[t]++; * } * * prob[t] will be the probability of observing an alignment that * uses a truncated begin into t to align any fragment. 
Then we take * average over all fragments: prob[t] / fcount[t] (since we'll only * be aligning one of those fragments) and use the log of that * probability as the penalty for observing a truncated alignment * rooted at state t. Conveniently, it turns out that all fragments * that share t are equiprobable (have equal prob[g][h] values), so * the average probability is the actual probability for each * fragment, and thus the correct penalty to apply. * * Fortunately, we can compute the correct penalty much more * efficiently than the two algorithms shown above. The * efficient way is implemented below. A test that the penalties * are correctly computed is in cm_tr_penalties_Validate(). * * This discussion assumes we're truncating 5' and 3', but if we're * only truncating 5' or 3' The situation is a little different. * * There's an example of computing all three types of penalties for * a simple CM in ELN3 p44-45. * ************************************************************ * Section 3. Adapting truncation penalties to allow for inserts. * * We need to be able to do truncated begins into insert states * because we enforce that the first/final residue of a sequence be * included in 5'/3' truncated alignments and we want to be able * to properly align those residues if they're probably emitted * by insert states. * * The methods/logic explained in sections 1 and 2 above I believe * is correct IF we ignore inserts (assume truncated begins into * them are impossible). But we need to allow inserts, so I modify * the truncation penalties as described above to allow for inserts * as follows. We can calculate the appropriate truncated begin * penalty for all MATP_MP, MATL_ML, MATR_MR, BIF_B states as with * the methods described above by ignoring inserts. This gives us a * probability p of using that state as the root of the truncated * alignment, i.e. the truncated begin state. (The log_2 of this * probability is the penalty.) 
We then partition p amongst the * MATP_MP, MATL_ML, MATR_MR, BIF_B states and any parent insert * states, i.e. any insert state that can transition into the * match/bif state. For each match/bif state there's 0, 1 or 2 * parent inserts. We then partition p based on the relative * expected occupancy of these inserts versus the match/bif state. * * This is certainly 'incorrect' in that it doesn't reflect the * true probability of a fragment being aligned to each of the * states, but it should be a close approximation. I think doing * it correctly is basically impossible in the context of a single * state-specific penalty (i.e. the penalty would have to be per-fragment * which would be hard to deal with in the DP functions). */ /* allocate and initialize the penalty arrays */ ESL_ALLOC(trp->g_ptyAA, sizeof(float *) * NTRPENALTY); ESL_ALLOC(trp->l_ptyAA, sizeof(float *) * NTRPENALTY); ESL_ALLOC(trp->ig_ptyAA, sizeof(int *) * NTRPENALTY); ESL_ALLOC(trp->il_ptyAA, sizeof(int *) * NTRPENALTY); for(i = 0; i < NTRPENALTY; i++) { trp->g_ptyAA[i] = NULL; trp->l_ptyAA[i] = NULL; trp->il_ptyAA[i] = NULL; trp->ig_ptyAA[i] = NULL; ESL_ALLOC(trp->g_ptyAA[i], sizeof(float) * cm->M); ESL_ALLOC(trp->l_ptyAA[i], sizeof(float) * cm->M); ESL_ALLOC(trp->ig_ptyAA[i], sizeof(int) * cm->M); ESL_ALLOC(trp->il_ptyAA[i], sizeof(int) * cm->M); esl_vec_FSet(trp->g_ptyAA[i], cm->M, IMPOSSIBLE); esl_vec_FSet(trp->l_ptyAA[i], cm->M, IMPOSSIBLE); esl_vec_ISet(trp->ig_ptyAA[i], cm->M, -INFTY); esl_vec_ISet(trp->il_ptyAA[i], cm->M, -INFTY); } /* DumpEmitMap(stdout, cm->emap, cm); */ /* Calculate local begin probabilities and expected occupancy */ ESL_ALLOC(begin, sizeof(float) * cm->M); cm_CalculateLocalBeginProbs(cm, cm->pbegin, cm->t, begin); if((status = cm_ExpectedPositionOccupancy(cm, &mexpocc, &iexpocc, &psi, NULL, NULL, NULL)) != eslOK) goto ERROR; /* Fill global and local truncation penalties in a single loop. 
We * step through all nodes and set the truncation penalties for the * MATP_MP, MATL_ML, MATR_MR, and BIF_B states and any parent * inserts (i1, i2) of those states. */ g_5and3 = 2. / (cm->clen * (cm->clen+1)); /* for global mode: probability of all fragments if we're truncating 5' and 3' */ g_5or3 = 1. / cm->clen; /* for global mode: probability of all fragments if we're only truncating 5' or 3' */ prv5 = prv3 = prv53 = 0.; /* initialize 'previous' probability values used for calc'ing local truncation penalties */ for(nd = 0; nd < cm->nodes; nd++) { lpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd) ? cm->emap->lpos[nd] : cm->emap->lpos[nd] + 1; rpos = (cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATR_nd) ? cm->emap->rpos[nd] : cm->emap->rpos[nd] - 1; /* now set penalties for match and insert states m, i1 and maybe i2 (if we're a MATP_MP or BIF_B) */ if(cm->ndtype[nd] == END_nd) { prv5 = prv3 = prv53 = 0.; } else if(cm->ndtype[nd] == BEGL_nd || cm->ndtype[nd] == BEGR_nd) { prv5 = (cm->ndtype[nd] == BEGL_nd) ? 0. : trp->l_ptyAA[TRPENALTY_5P_ONLY][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */; prv3 = (cm->ndtype[nd] == BEGR_nd) ? 0. : trp->l_ptyAA[TRPENALTY_3P_ONLY][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */; prv53 = trp->l_ptyAA[TRPENALTY_5P_AND_3P][cm->plast[cm->nodemap[nd]]]; /* parent BIF_B's probability */ } else if(cm->ndtype[nd] == MATP_nd || cm->ndtype[nd] == MATL_nd || cm->ndtype[nd] == MATR_nd || cm->ndtype[nd] == BIF_nd) { /* determine match states and insert states that pertain to this node */ m = cm->nodemap[nd]; /* MATP_MP, MATL_ML, MATR_MR, or BIF_B */ InsertsGivenNodeIndex(cm, nd-1, &i1, &i2); m_psi = psi[m]; if(cm->ndtype[nd] == MATP_MP) { m_psi += (psi[m+1] + psi[m+2]); } /* include MATP_ML and MATP_MR psi */ i1_psi = (i1 == -1) ? 0. : psi[i1]; i2_psi = (i2 == -1) ? 0. 
: psi[i2]; summed_psi = m_psi + i1_psi + i2_psi; if(ignore_inserts) { i1_psi = i2_psi = 0.; summed_psi = m_psi; } /* Global penalties */ /* sanity check, we should only set truncation penalty once per state */ if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][m])) goto ERROR; if((i1 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR; if((i2 != -1) && NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR; /* divide up the probability g_5and3 amongst relevant states m, i1, i2, weighted by psi */ trp->g_ptyAA[TRPENALTY_5P_AND_3P][m] = (m_psi / summed_psi) * g_5and3; if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * g_5and3; if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * g_5and3; /* same thing, for 5P only and 3P only */ if(rpos == cm->clen) { /* else it will remain IMPOSSIBLE */ trp->g_ptyAA[TRPENALTY_5P_ONLY][m] = (m_psi / summed_psi) * g_5or3; if(i1 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3; if(i2 != -1) trp->g_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3; } if(lpos == 1) { /* else it will remain IMPOSSIBLE */ trp->g_ptyAA[TRPENALTY_3P_ONLY][m] = (m_psi / summed_psi) * g_5or3; if(i1 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * g_5or3; if(i2 != -1) trp->g_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * g_5or3; } /* Local penalties */ subtree_clen = rpos - lpos + 1; nfrag5 = subtree_clen; nfrag3 = subtree_clen; nfrag53 = (subtree_clen * (subtree_clen+1)) / 2; /* determine probability of observing a fragment aligned at * state m (here, m is what I call t above and in notes) and * partition that probability between m and i1 and/or i2 by * relative occupancy of match versus inserts */ cur5 = begin[m] / (float) nfrag5 + prv5; cur3 = begin[m] / (float) nfrag3 + prv3; cur53 = begin[m] / (float) nfrag53 + prv53; /* sanity check, we should only set truncation penalty once per state */ 
if(NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][m])) goto ERROR; if((i1 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1])) goto ERROR; if((i2 != -1) && NOT_IMPOSSIBLE(trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2])) goto ERROR; trp->l_ptyAA[TRPENALTY_5P_AND_3P][m] = (m_psi / summed_psi) * cur53; if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i1] = (i1_psi / summed_psi) * cur53; if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_AND_3P][i2] = (i2_psi / summed_psi) * cur53; trp->l_ptyAA[TRPENALTY_5P_ONLY][m] = (m_psi / summed_psi) * cur5; if(i1 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i1] = (i1_psi / summed_psi) * cur5; if(i2 != -1) trp->l_ptyAA[TRPENALTY_5P_ONLY][i2] = (i2_psi / summed_psi) * cur5; trp->l_ptyAA[TRPENALTY_3P_ONLY][m] = (m_psi / summed_psi) * cur3; if(i1 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i1] = (i1_psi / summed_psi) * cur3; if(i2 != -1) trp->l_ptyAA[TRPENALTY_3P_ONLY][i2] = (i2_psi / summed_psi) * cur3; prv5 = (cm->ndtype[nd] == MATL_nd) ? cur5 : 0.; prv3 = (cm->ndtype[nd] == MATR_nd) ? cur3 : 0.; prv53 = cur53; } } /* all penalties are currently probabilities, convert them to log * probs and set integer penalties (careful, we have to check if * IMPOSSIBLE first) */ for(v = 0; v < cm->M; v++) { if((cm->stid[v] == MATP_MP || cm->stid[v] == MATL_ML || cm->stid[v] == MATR_MR || cm->stid[v] == BIF_B) || ((cm->sttype[v] == IL_st || cm->sttype[v] == IR_st) && (! StateIsDetached(cm, v)))) { /* Check for rare special case: if we're a MATP_IL and next * two states are MATP_IR and END_E, then we won't have set * a trunction penalty. This state will keep an impossible * truncated begin score, if we did a truncated begin into * it we'd just emit from the MATP_IL and then go to the * END_E anyway (the MATP_IR will be detached. */ if(cm->stid[v] == MATP_IL && cm->ndtype[cm->ndidx[v]+1] == END_nd) continue; /* glocal 5P AND 3P: all of these should have been set to a non-IMPOSSIBLE value */ if(! 
NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR; trp->ig_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0); trp->g_ptyAA[TRPENALTY_5P_AND_3P][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_AND_3P][v]); /* glocal 5P only: some may be IMPOSSIBLE */ if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_5P_ONLY][v])) { trp->ig_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_5P_ONLY][v], 1.0); trp->g_ptyAA[TRPENALTY_5P_ONLY][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_5P_ONLY][v]); } /* glocal 5P only: some may be IMPOSSIBLE */ if(NOT_IMPOSSIBLE(trp->g_ptyAA[TRPENALTY_3P_ONLY][v])) { trp->ig_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->g_ptyAA[TRPENALTY_3P_ONLY][v], 1.0); trp->g_ptyAA[TRPENALTY_3P_ONLY][v] = sreLOG2(trp->g_ptyAA[TRPENALTY_3P_ONLY][v]); } /* local penalties all of these should have been set to a non-IMPOSSIBLE value */ if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_AND_3P][v])) goto ERROR; if(! NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_5P_ONLY][v])) goto ERROR; if(! 
NOT_IMPOSSIBLE(trp->il_ptyAA[TRPENALTY_3P_ONLY][v])) goto ERROR; trp->il_ptyAA[TRPENALTY_5P_AND_3P][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v], 1.0); trp->il_ptyAA[TRPENALTY_5P_ONLY][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_5P_ONLY][v], 1.0); trp->il_ptyAA[TRPENALTY_3P_ONLY][v] = Prob2Score(trp->l_ptyAA[TRPENALTY_3P_ONLY][v], 1.0); trp->l_ptyAA[TRPENALTY_5P_AND_3P][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_AND_3P][v]); trp->l_ptyAA[TRPENALTY_5P_ONLY][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_5P_ONLY][v]); trp->l_ptyAA[TRPENALTY_3P_ONLY][v] = sreLOG2(trp->l_ptyAA[TRPENALTY_3P_ONLY][v]); } } if(ignore_inserts) { if((status = cm_tr_penalties_Validate(trp, cm, 0.0001, errbuf)) != eslOK) { printf("%s", errbuf); goto ERROR; } } /* cm_tr_penalties_Dump(stdout, cm, trp); */ if(mexpocc != NULL) free(mexpocc); if(iexpocc != NULL) free(iexpocc); if(psi != NULL) free(psi); if(begin != NULL) free(begin); return trp; ERROR: if(mexpocc != NULL) free(mexpocc); if(iexpocc != NULL) free(iexpocc); if(psi != NULL) free(psi); if(begin != NULL) free(begin); if(trp != NULL) cm_tr_penalties_Destroy(trp); return NULL; }
/**
 * int main(int argc, char **argv)
 * Main driver
 *
 * Takes two command line arguments: <input hmmfile> and <output hmmfile>.
 *
 * For each HMM read from the input file:
 *   1. sum the transition vectors t[k] over nodes k = 1..M-1;
 *   2. renormalize the match (first 3), insert (next 2) and delete
 *      (next 2) transition groups of that sum separately;
 *   3. overwrite t[k] for k = 1..M-1 with the resulting averages;
 *   4. validate the modified HMM and append it to the output file;
 *   5. print one line of summary statistics to stdout.
 *
 * Any failure to open, validate, write or close is fatal and is
 * reported through p7_Fail(), the same way input-file problems are
 * reported, so the user sees the message and the process does not
 * exit with a success status.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS     *go         = NULL;   /* command line processing        */
  ESL_ALPHABET    *abc        = NULL;
  char            *hmmfile    = NULL;
  char            *outhmmfile = NULL;
  P7_HMMFILE      *hfp        = NULL;
  FILE            *outhmmfp   = NULL;   /* HMM output file handle         */
  P7_HMM          *hmm        = NULL;
  P7_BG           *bg         = NULL;
  int              nhmm;
  double           x;
  float            KL;
  int              status;
  char             errbuf[eslERRBUFSIZE];
  float            average_internal_transitions[ p7H_NTRANSITIONS ];
  int              k;
  char             errmsg[eslERRBUFSIZE];

  /* Process the command line options. */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK ||
      esl_opt_VerifyConfig(go)               != eslOK)
    {
      printf("Failed to parse command line: %s\n", go->errbuf);
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if (esl_opt_GetBoolean(go, "-h") == TRUE)
    {
      profillic_p7_banner(stdout, argv[0], banner);
      esl_usage(stdout, argv[0], usage);
      puts("\nOptions:");
      esl_opt_DisplayHelp(stdout, go, 0, 2, 80); /* 0=docgroup, 2 = indentation; 80=textwidth*/
      exit(0);
    }
  if (esl_opt_ArgNumber(go) != 2)
    {
      puts("Incorrect number of command line arguments.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if ((hmmfile = esl_opt_GetArg(go, 1)) == NULL)
    {
      puts("Failed to read <input hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }
  if ((outhmmfile = esl_opt_GetArg(go, 2)) == NULL)
    {
      puts("Failed to read <output hmmfile> argument from command line.");
      esl_usage(stdout, argv[0], usage);
      printf("\nTo see more help on available options, do %s -h\n\n", argv[0]);
      exit(1);
    }

  profillic_p7_banner(stdout, argv[0], banner);

  /* Initializations: open the input HMM file for reading */
  status = p7_hmmfile_OpenE(hmmfile, NULL, &hfp, errbuf);
  if      (status == eslENOTFOUND) p7_Fail("File existence/permissions problem in trying to open HMM file %s.\n%s\n", hmmfile, errbuf);
  else if (status == eslEFORMAT)   p7_Fail("File format problem in trying to open HMM file %s.\n%s\n",                hmmfile, errbuf);
  else if (status != eslOK)        p7_Fail("Unexpected error %d in opening HMM file %s.\n%s\n",               status, hmmfile, errbuf);

  /* Initializations: open the output HMM file for writing.
   * <status> is eslOK at this point, so it must not be used as the
   * failure code here; fail loudly instead.
   */
  if ((outhmmfp = fopen(outhmmfile, "w")) == NULL)
    p7_Fail("Failed to open HMM file %s for writing\n", outhmmfile);

  /* Main body: read HMMs one at a time, print one line of stats */
  printf("#\n");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "idx",  "name",                 "accession",    "nseq",     "eff_nseq", "M",      "relent", "info",   "p relE", "compKL");
  printf("# %-4s %-20s %-12s %8s %8s %6s %6s %6s %6s %6s\n", "----", "--------------------", "------------", "--------", "--------", "------", "------", "------", "------", "------");

  nhmm = 0;
  while ((status = p7_hmmfile_Read(hfp, &abc, &hmm)) != eslEOF)
    {
      if      (status == eslEOD)       esl_fatal("read failed, HMM file %s may be truncated?", hmmfile);
      else if (status == eslEFORMAT)   esl_fatal("bad file format in HMM file %s",             hmmfile);
      else if (status == eslEINCOMPAT) esl_fatal("HMM file %s contains different alphabets",   hmmfile);
      else if (status != eslOK)        esl_fatal("Unexpected error in reading HMMs from %s",   hmmfile);
      nhmm++;

      if (bg == NULL) bg = p7_bg_Create(abc);

      /* Sum the transition vectors of all internal nodes 1..M-1 ... */
      esl_vec_FSet(average_internal_transitions, p7H_NTRANSITIONS, 0.);
      for (k = 1; k < hmm->M; k++)
        esl_vec_FAdd(average_internal_transitions, hmm->t[k], p7H_NTRANSITIONS);

      /* ... and renormalize each transition group on its own. */
      // Match transitions
      esl_vec_FNorm(average_internal_transitions,     3);
      // Insert transitions
      esl_vec_FNorm(average_internal_transitions + 3, 2);
      // Delete transitions
      esl_vec_FNorm(average_internal_transitions + 5, 2);

      // Ok now set them.
      for (k = 1; k < hmm->M; k++)
        esl_vec_FCopy(average_internal_transitions, p7H_NTRANSITIONS, hmm->t[k]);

      /* Both of these used to return from main() without ever showing
       * the diagnostic that had been collected in <errmsg>.
       */
      if ((status = p7_hmm_Validate(hmm, errmsg, 0.0001)) != eslOK)
        p7_Fail("HMM #%d in %s failed validation after averaging transitions:\n%s\n", nhmm, hmmfile, errmsg);
      if ((status = p7_hmmfile_WriteASCII(outhmmfp, -1, hmm)) != eslOK)
        p7_Fail("HMM save failed (HMM #%d, output file %s)\n", nhmm, outhmmfile);

      p7_MeanPositionRelativeEntropy(hmm, bg, &x);
      p7_hmm_CompositionKLDist(hmm, bg, &KL, NULL);

      printf("%-6d %-20s %-12s %8d %8.2f %6d %6.2f %6.2f %6.2f %6.2f\n",
             nhmm,
             hmm->name,
             hmm->acc == NULL ? "-" : hmm->acc,
             hmm->nseq,
             hmm->eff_nseq,
             hmm->M,
             p7_MeanMatchRelativeEntropy(hmm, bg),
             p7_MeanMatchInfo(hmm, bg),
             x,
             KL);
      /* p7_MeanForwardScore(hmm, bg)); */

      p7_hmm_Destroy(hmm);
    }

  p7_bg_Destroy(bg);
  esl_alphabet_Destroy(abc);
  p7_hmmfile_Close(hfp);

  /* fclose() flushes buffered output; a failure here means the saved
   * HMM file may be incomplete, so it must not pass silently.
   */
  if (fclose(outhmmfp) != 0)
    p7_Fail("Failed to finish writing HMM file %s\n", outhmmfile);

  esl_getopts_Destroy(go);
  exit(0);
}
/* Function:  p7_GNull2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Thu Feb 28 09:52:28 2008 [Janelia]
 *
 * Purpose:   Compute the "null2" odds ratios f'(x)/f(x) for the
 *            envelope described by posterior probability matrix <pp>
 *            of model <gm>, and store them in <null2>. The caller
 *            supplies <null2> with room for at least <alphabet->Kp>
 *            values.
 *
 *            This expectation method is meant for envelopes in
 *            simple, well resolved regions (a single envelope, no
 *            stochastic traceback clustering needed).
 *
 *            <pp> must have been calculated for the envelope only,
 *            so that its rows are numbered <1..Ld> for an envelope
 *            <ienv..jenv> of length <Ld = jenv-ienv+1>.
 *
 * Args:      gm    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <gm> against domain envelope <dsq+i-1> (offset)
 *            null2 - RETURN: null2 odds ratios per residue; <0..Kp-1>; caller allocated space
 *
 * Returns:   <eslOK> on success; <null2> holds the odds ratios.
 *            Row 0 of <pp> is overwritten as scratch space: on return
 *            it holds the log of the expected frequency with which
 *            each M,I,N,C,J state generated residues in <pp>.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2)
{
  float **dp     = pp->dp;                    /* name is fixed: the {MDI}MX() macros expand to <dp>   */
  float  *xmx    = pp->xmx;                   /* name is fixed: the XMX() macro expands to <xmx>      */
  int     M      = gm->M;
  int     Ld     = pp->L;
  int     ncells = (M+1) * p7G_NSCELLS;       /* width of one main-state row                          */
  float   special;                            /* log of summed N, C, J usage                          */
  float   acc;                                /* running log-sum for one residue                      */
  int     sym;                                /* over symbols 0..K-1                                  */
  int     row;                                /* over offset envelope dsq positions 1..Ld             */
  int     k;                                  /* over model M states 1..M, I states 1..M-1            */

  /* Row 0 of <pp> becomes the column sums over rows 1..Ld: the
   * expected number of times each emitting state was used to
   * generate the Ld residues of this domain. Start from row 1,
   * then fold in the rest.
   */
  esl_vec_FCopy(dp[1],              ncells,       dp[0]);
  esl_vec_FCopy(xmx + p7G_NXCELLS,  p7G_NXCELLS,  xmx);
  row = 2;
  while (row <= Ld)
    {
      esl_vec_FAdd(dp[0], dp[row],                 ncells);
      esl_vec_FAdd(xmx,   xmx + row*p7G_NXCELLS,   p7G_NXCELLS);
      row++;
    }

  /* Turn expected counts into log frequencies (log count - log Ld);
   * these are the log posterior weights.
   */
  esl_vec_FLog(dp[0], ncells);
  esl_vec_FLog(xmx,   p7G_NXCELLS);
  esl_vec_FIncrement(dp[0], ncells,      -log((float)Ld));
  esl_vec_FIncrement(xmx,   p7G_NXCELLS, -log((float)Ld));

  /* N, C and J contribute an odds ratio of 1 (log 0), so only their
   * combined weight is needed.
   */
  special = p7_FLogsum(p7_FLogsum(XMX(0,p7G_N), XMX(0,p7G_C)), XMX(0,p7G_J));

  /* Posterior-weighted log-sum over every emission vector that can be
   * used in paths explaining the domain. This is dog-slow; a point
   * for future optimization.
   */
  for (sym = 0; sym < gm->abc->K; sym++)
    {
      acc = -eslINFINITY;
      for (k = 1; k < M; k++)
        {
          acc = p7_FLogsum(acc, MMX(0,k) + p7P_MSC(gm, k, sym));
          acc = p7_FLogsum(acc, IMX(0,k) + p7P_ISC(gm, k, sym));
        }
      /* last match state has no insert; note k == M here for any M >= 1 */
      acc = p7_FLogsum(acc, MMX(0,M) + p7P_MSC(gm, k, sym));
      acc = p7_FLogsum(acc, special);
      null2[sym] = acc;
    }

  /* Leave log space: null2[x] = f_d(x) / f_0(x) for x = 0..K-1, where
   * f_d(x) are the ad hoc "null2" residue frequencies for this envelope.
   */
  esl_vec_FExp(null2, gm->abc->K);

  /* Degenerate symbols get the average of the odds ratios they cover;
   * that call does not set gap, nonresidue or missing, so do it here.
   */
  esl_abc_FAvgScVec(gm->abc, null2);
  null2[gm->abc->K]    = 1.0;        /* gap character    */
  null2[gm->abc->Kp-2] = 1.0;        /* nonresidue "*"   */
  null2[gm->abc->Kp-1] = 1.0;        /* missing data "~" */

  return eslOK;
}
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the
 *            <P7_BG> object <bg>.
 *
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 *            Expected file layout, with '#' starting a comment: the
 *            first token is the alphabet type, then one line per
 *            canonical residue holding exactly two tokens, the
 *            residue letter and its probability. Every canonical
 *            residue must appear exactly once, no probability may be
 *            negative, and the probabilities must sum to 1.0 within
 *            a tolerance of 0.001; they are renormalized before
 *            being stored.
 *
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 *
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason.  In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp   = NULL;
  float          *fq    = NULL;   /* frequencies parsed so far; -1.0 = residue not seen yet */
  int             n     = 0;      /* number of residue lines accepted                       */
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status =  esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file  %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);

  /* Parse into a scratch vector so <bg> stays untouched on any error. */
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* token 1: a single canonical residue letter, not seen before */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if (toklen != 1 ||   ! esl_abc_CIsCanonical(bg->abc, *tok))
	ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)
	ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      /* token 2: its probability */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok))
	ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);

      /* A negative number is never a probability. Rejecting it here
       * also keeps a literal -1.0 in the file from being mistaken for
       * the "not seen yet" sentinel, which would let the same residue
       * be listed again without tripping the duplicate check above.
       */
      if (fq[x] < 0.0f)
	ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability >= 0, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      /* nothing else is allowed on the line */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if ( n != bg->abc->K)
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if ( esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK)
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);

  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  if (fq)  free(fq);
  if (efp) esl_fileparser_Close(efp);
  return status;
}
/* Function: p7_profile_Create() * Synopsis: Allocates a profile. * * Purpose: Allocates for a profile of up to <M> nodes, for digital * alphabet <abc>. * * Because this function might be in the critical path (in * hmmscan, for example), we leave much of the model * uninitialized, including scores and length model * probabilities. The <p7_profile_Config()> call is what * sets these. * * The reference pointer <gm->abc> is set to <abc>. * * Returns: a pointer to the newly allocated profile. * * Throws: <NULL> on allocation error. */ P7_PROFILE * p7_profile_Create(int allocM, const ESL_ALPHABET *abc) { P7_PROFILE *gm = NULL; int x; int status; /* level 0 */ ESL_ALLOC(gm, sizeof(P7_PROFILE)); gm->tsc = NULL; gm->rsc = NULL; gm->name = NULL; gm->acc = NULL; gm->desc = NULL; gm->rf = NULL; gm->mm = NULL; gm->cs = NULL; gm->consensus = NULL; /* level 1 */ ESL_ALLOC(gm->tsc, sizeof(float) * (allocM+1) * p7P_NTRANS); /* 0..M */ ESL_ALLOC(gm->rsc, sizeof(float *) * abc->Kp); ESL_ALLOC(gm->rf, sizeof(char) * (allocM+2)); /* yes, +2: each is (0)1..M, +trailing \0 */ ESL_ALLOC(gm->mm, sizeof(char) * (allocM+2)); ESL_ALLOC(gm->cs, sizeof(char) * (allocM+2)); ESL_ALLOC(gm->consensus, sizeof(char) * (allocM+2)); gm->rsc[0] = NULL; /* level 2 */ ESL_ALLOC(gm->rsc[0], sizeof(float) * abc->Kp * (allocM+1) * p7P_NR); for (x = 1; x < abc->Kp; x++) gm->rsc[x] = gm->rsc[0] + x * (allocM+1) * p7P_NR; /* Initialization of tsc[0], including removal of I0. tsc[k-1,LM],tsc[k-1,GM] will be configured + overwritten later */ esl_vec_FSet(gm->tsc, p7P_NTRANS, -eslINFINITY); /* tsc[M] initialized and Im removed when we know actual M : see modelconfig.c */ for (x = 0; x < abc->Kp; x++) { P7P_MSC(gm, 0, x) = -eslINFINITY; /* no emissions from nonexistent M_0... */ P7P_ISC(gm, 0, x) = -eslINFINITY; /* nor I_0... 
*/ /* I_M is initialized in profile config, when we know actual M, not just allocated max M */ } x = esl_abc_XGetGap(abc); /* no emission can emit/score gap characters */ esl_vec_FSet(gm->rsc[x], (allocM+1)*p7P_NR, -eslINFINITY); x = esl_abc_XGetMissing(abc); /* no emission can emit/score missing data characters */ esl_vec_FSet(gm->rsc[x], (allocM+1)*p7P_NR, -eslINFINITY); /* Set remaining info */ gm->M = 0; gm->allocM = allocM; gm->L = -1; /* "unset" flag */ gm->nj = -1.0f; /* "unset" flag */ gm->pglocal = -1.0f; /* "unset" flag */ gm->roff = -1; gm->eoff = -1; gm->offs[p7_MOFFSET] = -1; gm->offs[p7_FOFFSET] = -1; gm->offs[p7_POFFSET] = -1; gm->name = NULL; gm->acc = NULL; gm->desc = NULL; gm->rf[0] = 0; /* RF line is optional annotation; this flags that it's not set yet */ gm->mm[0] = 0; /* likewise for MM annotation line */ gm->cs[0] = 0; /* likewise for CS annotation line */ gm->consensus[0] = 0; for (x = 0; x < p7_NEVPARAM; x++) gm->evparam[x] = p7_EVPARAM_UNSET; for (x = 0; x < p7_NCUTOFFS; x++) gm->cutoff[x] = p7_CUTOFF_UNSET; for (x = 0; x < p7_MAXABET; x++) gm->compo[x] = p7_COMPO_UNSET; gm->max_length = -1; /* "unset" */ gm->abc = abc; return gm; ERROR: p7_profile_Destroy(gm); return NULL; }
/* glocal_region_trace_ensemble()
 * EPN, Tue Oct  5 10:13:25 2010
 *
 * Based on p7_domaindef.c's region_trace_ensemble(), modified to work
 * on generic matrices (usable with glocally configured models). The
 * extra parameter <do_null2> lets the caller skip all null2-related
 * work when null2 has been turned off in the pipeline.
 *
 * Notes carried over from p7_domaindef.c::region_trace_ensemble()
 * (SRE, Fri Feb  8 11:49:44 2008 [Janelia]):
 *
 * Region <ireg>..<jreg> of <dsq> is suspected to contain more than
 * one domain; an ensemble of stochastic tracebacks is sampled and
 * clustered to sort that out.
 *
 * Inputs the caller must provide:
 *   <fwd>   a filled Forward matrix for the region <dsq+ireg-1>,
 *           length <jreg-ireg+1>, for the model configured in
 *           multihit mode with its target length distribution set to
 *           the total length of <dsq> (the same configuration used to
 *           score the complete sequence).
 *   <wrk>   a DP matrix with at least one row, used as scratch
 *           (typically the caller's not-yet-used Backward matrix).
 *   <ddef>  heuristic clustering parameters plus working space. Its
 *           <ddef->sp> must be fresh: the caller has to Reuse() it
 *           specifically, because it can't Reuse() the whole <ddef>
 *           while it is in the middle of analyzing regions.
 *
 * On return:
 *   <*ret_nc> is the number of clusters left after dominated ones
 *           have been removed; their coordinates can be retrieved
 *           with <p7_spensemble_GetClusterCoords(ddef->sp...)>.
 *   <ddef->n2sc[ireg..jreg]> holds the log f'(x_i) / f(x_i) null2
 *           score of each residue if <do_null2> is set, and zeros
 *           otherwise.
 *   <ddef->sp> holds the cluster definitions; the caller must
 *           <esl_spensemble_Reuse()> it before calling this again.
 *   <ddef->tr> has been used as working memory for sampled traces.
 *   <wrk>   has had its zero row clobbered by the null2 calculation.
 *
 * Returns <eslOK>.
 */
static int
glocal_region_trace_ensemble(P7_DOMAINDEF *ddef, const P7_PROFILE *gm, const ESL_DSQ *dsq, int ireg, int jreg,
			     const P7_GMX *fwd, P7_GMX *wrk, int do_null2, int *ret_nc)
{
  int    Lr   = jreg-ireg+1;             /* length of the region                                  */
  float *n2sc = ddef->n2sc + ireg - 1;   /* region-relative view: n2sc[pos] for pos = 1..Lr       */
  float  null2[p7_MAXCODE];              /* per-domain null2 odds ratios                          */
  int    sample;                         /* index over sampled traces                             */
  int    d, d2;                          /* indices over domains / clusters                       */
  int    pos;                            /* position in the region, 1..Lr                         */
  int    nc;                             /* number of clusters found                              */
  int    nov;                            /* overlap length of two clusters, in residues           */
  int    shorter;                        /* length of the shorter of two clusters                 */
  int    loser;                          /* which of two overlapping clusters is dominated        */
  int    kept;                           /* number of surviving clusters                          */

  esl_vec_FSet(ddef->n2sc+ireg, Lr, 0.0); /* zero the null2 scores in region */

  /* By default, results are made reproducible by resetting the RNG
   * to its originally seeded state.
   */
  if (ddef->do_reseeding)
    esl_randomness_Init(ddef->r, esl_randomness_GetSeed(ddef->r));

  /* Sample the ensemble of traces; if requested, accumulate null2
   * odds ratios per residue as we go.
   */
  for (sample = 0; sample < ddef->nsamples; sample++)
    {
      p7_GStochasticTrace(ddef->r, dsq+ireg-1, Lr, gm, fwd, ddef->tr);
      p7_trace_Index(ddef->tr);

      pos = 1;
      for (d = 0; d < ddef->tr->ndom; d++)
	{
	  p7_spensemble_Add(ddef->sp, sample,
			    ddef->tr->sqfrom[d]+ireg-1, ddef->tr->sqto[d]+ireg-1,
			    ddef->tr->hmmfrom[d],       ddef->tr->hmmto[d]);

	  if (! do_null2) continue;

	  p7_GNull2_ByTrace(gm, ddef->tr, ddef->tr->tfrom[d], ddef->tr->tto[d], wrk, null2);

	  /* residues outside domains get bumped +1: because f'(x) = f(x), so f'(x)/f(x) = 1 in these segments */
	  while (pos <= ddef->tr->sqfrom[d]) { n2sc[pos] += 1.0;                    pos++; }

	  /* Residues inside domains get bumped by their null2 ratio */
	  while (pos <= ddef->tr->sqto[d])   { n2sc[pos] += null2[dsq[ireg+pos-1]]; pos++; }
	}

      /* the remaining residues in the region outside any domains get +1 */
      if (do_null2)
	while (pos <= Lr) { n2sc[pos] += 1.0; pos++; }

      p7_trace_Reuse(ddef->tr);
    }

  /* Average the accumulated ratios over the samples and convert them
   * to log odds null2 scores, one per residue of the region.
   */
  if (do_null2)
    for (pos = 1; pos <= Lr; pos++)
      n2sc[pos] = logf(n2sc[pos] / (float) ddef->nsamples);

  /* Cluster the ensemble of traces to break region into envelopes. */
  p7_spensemble_Cluster(ddef->sp, ddef->min_overlap, ddef->of_smaller, ddef->max_diagdiff,
			ddef->min_posterior, ddef->min_endpointp, &nc);

  /* A little hacky now. Remove "dominated" domains relative to seq
   * coords. <assignment> is overloaded as a per-cluster flag:
   * nonzero means the cluster is dominated by another one.
   */
  for (d = 0; d < nc; d++)
    ddef->sp->assignment[d] = 0;

  /* who dominates who? (by post prob) */
  for (d = 0; d < nc; d++)
    for (d2 = d+1; d2 < nc; d2++)
      {
	nov = ESL_MIN(ddef->sp->sigc[d].j, ddef->sp->sigc[d2].j)
	    - ESL_MAX(ddef->sp->sigc[d].i, ddef->sp->sigc[d2].i) + 1;
	if (nov == 0) break;

	shorter = ESL_MIN(ddef->sp->sigc[d].j  - ddef->sp->sigc[d].i  + 1,
			  ddef->sp->sigc[d2].j - ddef->sp->sigc[d2].i + 1);

	if ((float) nov / (float) shorter >= 0.8) /* overlap */
	  {
	    /* the one with the lower (or equal, for <d>) probability loses */
	    loser = (ddef->sp->sigc[d].prob > ddef->sp->sigc[d2].prob) ? d2 : d;
	    ddef->sp->assignment[loser] = 1;
	  }
      }

  /* shrink the sigc list in place, dropping dominated clusters */
  kept = 0;
  for (d = 0; d < nc; d++)
    {
      if (ddef->sp->assignment[d]) continue;	/* skip cluster d, it's dominated. */
      if (kept != d)
	memcpy(ddef->sp->sigc + kept, ddef->sp->sigc + d, sizeof(struct p7_spcoord_s));
      kept++;
    }

  ddef->sp->nc = kept;
  *ret_nc      = kept;
  return eslOK;
}
/* Function: p7_domaindef_GlocalByPosteriorHeuristics() * Synopsis: Define glocal domains in a sequence using posterior probs. * Incept: EPN, Tue Oct 5 10:02:34 2010 * SRE, Sat Feb 23 08:17:44 2008 [Janelia] (p7_domaindef_ByPosteriorHeuristics()) * * Purpose: Given a sequence <sq> and model <gm> for which we have * already calculated a Forward and Backward parsing * matrices <gxf> and <gxb>; use posterior probability * heuristics to determine an annotated domain structure; * and for each domain found, score it (with null2 * calculations) and obtain an optimal accuracy alignment, * using <fwd> and <bck> matrices as workspace for the * necessary full-matrix DP calculations. Caller provides a * new or reused <ddef> object to hold these results. * * As a special case, if the profile is in unihit mode * upon entering, we don't ever modify its configuration. * This is especially important if this function is * being used within a search/scan pipeline with a * specially configured p7 profile in which N->N and/or * C->C transitions have been set to IMPOSSIBLE. (If * we were to call ReconfigLength() on such a profile * we would make those transitions possible.) * * One case in which profile reconfiguration is necessary * is when multiple domains are suspected. However, we * guard against this if the profile enters in unihit mode * by no allowing multiple domains (in fact, it should * never happen because J states are unreachable in unihit * profiles). If multiple domains are suspected in this case, * we return eslEINCONCEIVABLE. * * Upon return, <ddef> contains the definitions of all the * domains: their bounds, their null-corrected Forward * scores, and their optimal posterior accuracy alignments. * * <do_null2> is TRUE if we'll eventually apply a null2 * penalty FALSE if not. If FALSE, we can save time by * skipping Backward calls at some stages. * * Returns: <eslOK> on success. * * <eslERANGE> on numeric overflow in posterior * decoding. 
This should not be possible for multihit * models. * * <eslEINCONCEIVABLE> if profile enters as unihit but * multiple domains are suspected. */ int p7_domaindef_GlocalByPosteriorHeuristics(const ESL_SQ *sq, P7_PROFILE *gm, P7_GMX *gxf, P7_GMX *gxb, P7_GMX *fwd, P7_GMX *bck, P7_DOMAINDEF *ddef, int do_null2) { int i, j; int triggered; int d; int i2,j2; int last_j2; int nc; int saveL = gm->L; /* Save the length config of <om>; will restore upon return */ int save_mode = gm->mode; /* Likewise for the mode. */ int status; int save_mode_is_unihit; save_mode_is_unihit = (p7_IsMulti(save_mode)) ? FALSE : TRUE; /* if save_mode_is_unihit is TRUE, we never modify profile's configuration (length nor mode) */ if ((status = p7_domaindef_GrowTo(ddef, sq->n)) != eslOK) return status; /* ddef's btot,etot,mocc now ready for seq of length n */ /*printf("GDD P7 mode: %d\n", gm->mode);*/ if ((status = p7_GDomainDecoding(gm, gxf, gxb, ddef)) != eslOK) return status; /* ddef->{btot,etot,mocc} now made. */ /*printf("In p7_domaindef_GlocalByPosteriorHeuristics(): mode: %d rt1: %g rt2: %g rt3: %g nsamples: %d reseed: %d\n", save_mode, ddef->rt1, ddef->rt2, ddef->rt3, ddef->nsamples, ddef->do_reseeding);*/ esl_vec_FSet(ddef->n2sc, sq->n+1, 0.0); /* ddef->n2sc null2 scores are initialized */ ddef->nexpected = ddef->btot[sq->n]; /* posterior expectation for # of domains (same as etot[sq->n]) */ if(! save_mode_is_unihit) p7_ReconfigUnihit(gm, saveL); /* process each domain in unihit mode, regardless of gm->mode */ i = -1; triggered = FALSE; for (j = 1; j <= sq->n; j++) { /*printf("GDD j: %5d m: %.5f b: %8.3f e: %8.3f bhere: %8.3f ehere: %8.3f\n", j, ddef->mocc[j], ddef->btot[j], ddef->etot[j], ddef->btot[j] - ddef->btot[j-1], ddef->etot[j] - ddef->etot[j-1]); */ if (! 
triggered) { /* xref J2/101 for what the logic below is: */ if (ddef->mocc[j] - (ddef->btot[j] - ddef->btot[j-1]) < ddef->rt2) i = j; else if (i == -1) i = j; if (ddef->mocc[j] >= ddef->rt1) triggered = TRUE; } else if (ddef->mocc[j] - (ddef->etot[j] - ddef->etot[j-1]) < ddef->rt2) { /* We have a region i..j to evaluate. */ p7_gmx_GrowTo(fwd, gm->M, j-i+1); p7_gmx_GrowTo(bck, gm->M, j-i+1); ddef->nregions++; if (is_multidomain_region(ddef, i, j)) { if(save_mode_is_unihit) return eslEINCONCEIVABLE; /* This region appears to contain more than one domain, so we have to * resolve it by cluster analysis of posterior trace samples, to define * one or more domain envelopes. */ ddef->nclustered++; /* Resolve the region into domains by stochastic trace * clustering; assign position-specific null2 model by * stochastic trace clustering; there is redundancy * here; we will consolidate later if null2 strategy * works */ p7_ReconfigMultihit(gm, saveL); p7_GForward(sq->dsq+i-1, j-i+1, gm, fwd, NULL); glocal_region_trace_ensemble(ddef, gm, sq->dsq, i, j, fwd, bck, do_null2, &nc); p7_ReconfigUnihit(gm, saveL); /* ddef->n2sc is now set on i..j by the traceback-dependent method */ last_j2 = 0; for (d = 0; d < nc; d++) { p7_spensemble_GetClusterCoords(ddef->sp, d, &i2, &j2, NULL, NULL, NULL); if (i2 <= last_j2) ddef->noverlaps++; /* Note that k..m coords on model are available, but * we're currently ignoring them. This leads to a * rare clustering bug that we eventually need to fix * properly [xref J3/32]: two different regions in one * profile HMM might have hit same seq domain, and * when we now go to calculate an OA trace, nothing * constrains us to find the two different alignments * to the HMM; in fact, because OA is optimal, we'll * find one and the *same* alignment, leading to an * apparent duplicate alignment in the output. * * Registered as #h74, Dec 2009, after EBI finds and * reports it. 
#h74 is worked around in p7_tophits.c * by hiding all but one envelope with an identical * alignment, in the rare event that this * happens. [xref J5/130]. */ ddef->nenvelopes++; if (glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i2, j2, TRUE, do_null2, FALSE) == eslOK) last_j2 = j2; } p7_spensemble_Reuse(ddef->sp); p7_trace_Reuse(ddef->tr); } else { /* The region looks simple, single domain; convert the region to an envelope. */ ddef->nenvelopes++; glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i, j, FALSE, do_null2, FALSE); } i = -1; triggered = FALSE; } } /* If profile was unihit upon entrance, we didn't modify its configuration (length nor mode), * else restore it to its original multihit mode, and to its original length model */ if (! save_mode_is_unihit) { p7_ReconfigMultihit(gm, saveL); } return eslOK; }