/* Function:  p7_oprofile_ReconfigUnihit()
 * Synopsis:  Quickly reconfig model into unihit mode for target length <L>.
 * Incept:    MSF Tue Nov 3, 2009 [Janelia]
 *
 * Purpose:   Takes a profile <om> that has already been configured
 *            once and switches it to unihit mode for a target
 *            sequence of length <L>. The work is delegated entirely
 *            to <p7_ReconfigUnihit()>.
 *
 *            Domain definition uses this to flip the model in and
 *            out of unihit <L=0> mode while it processes individual
 *            domains.
 *
 * Args:      om - profile to reconfigure; handed unchanged to
 *                 <p7_ReconfigUnihit()>
 *            L  - target sequence length to configure for
 *
 * Returns:   whatever status <p7_ReconfigUnihit()> returns.
 *
 * NOTE(review): <om> is declared as a P7_OPROFILE but is passed
 * straight to p7_ReconfigUnihit(); presumably the two profile types
 * are interchangeable in this build -- confirm.
 */
int
p7_oprofile_ReconfigUnihit(P7_OPROFILE *om, int L)
{
  int status;

  status = p7_ReconfigUnihit(om, L);
  return status;
}
/* Function: p7_domaindef_GlocalByPosteriorHeuristics() * Synopsis: Define glocal domains in a sequence using posterior probs. * Incept: EPN, Tue Oct 5 10:02:34 2010 * SRE, Sat Feb 23 08:17:44 2008 [Janelia] (p7_domaindef_ByPosteriorHeuristics()) * * Purpose: Given a sequence <sq> and model <gm> for which we have * already calculated a Forward and Backward parsing * matrices <gxf> and <gxb>; use posterior probability * heuristics to determine an annotated domain structure; * and for each domain found, score it (with null2 * calculations) and obtain an optimal accuracy alignment, * using <fwd> and <bck> matrices as workspace for the * necessary full-matrix DP calculations. Caller provides a * new or reused <ddef> object to hold these results. * * As a special case, if the profile is in unihit mode * upon entering, we don't ever modify its configuration. * This is especially important if this function is * being used within a search/scan pipeline with a * specially configured p7 profile in which N->N and/or * C->C transitions have been set to IMPOSSIBLE. (If * we were to call ReconfigLength() on such a profile * we would make those transitions possible.) * * One case in which profile reconfiguration is necessary * is when multiple domains are suspected. However, we * guard against this if the profile enters in unihit mode * by no allowing multiple domains (in fact, it should * never happen because J states are unreachable in unihit * profiles). If multiple domains are suspected in this case, * we return eslEINCONCEIVABLE. * * Upon return, <ddef> contains the definitions of all the * domains: their bounds, their null-corrected Forward * scores, and their optimal posterior accuracy alignments. * * <do_null2> is TRUE if we'll eventually apply a null2 * penalty FALSE if not. If FALSE, we can save time by * skipping Backward calls at some stages. * * Returns: <eslOK> on success. * * <eslERANGE> on numeric overflow in posterior * decoding. 
This should not be possible for multihit * models. * * <eslEINCONCEIVABLE> if profile enters as unihit but * multiple domains are suspected. */ int p7_domaindef_GlocalByPosteriorHeuristics(const ESL_SQ *sq, P7_PROFILE *gm, P7_GMX *gxf, P7_GMX *gxb, P7_GMX *fwd, P7_GMX *bck, P7_DOMAINDEF *ddef, int do_null2) { int i, j; int triggered; int d; int i2,j2; int last_j2; int nc; int saveL = gm->L; /* Save the length config of <om>; will restore upon return */ int save_mode = gm->mode; /* Likewise for the mode. */ int status; int save_mode_is_unihit; save_mode_is_unihit = (p7_IsMulti(save_mode)) ? FALSE : TRUE; /* if save_mode_is_unihit is TRUE, we never modify profile's configuration (length nor mode) */ if ((status = p7_domaindef_GrowTo(ddef, sq->n)) != eslOK) return status; /* ddef's btot,etot,mocc now ready for seq of length n */ /*printf("GDD P7 mode: %d\n", gm->mode);*/ if ((status = p7_GDomainDecoding(gm, gxf, gxb, ddef)) != eslOK) return status; /* ddef->{btot,etot,mocc} now made. */ /*printf("In p7_domaindef_GlocalByPosteriorHeuristics(): mode: %d rt1: %g rt2: %g rt3: %g nsamples: %d reseed: %d\n", save_mode, ddef->rt1, ddef->rt2, ddef->rt3, ddef->nsamples, ddef->do_reseeding);*/ esl_vec_FSet(ddef->n2sc, sq->n+1, 0.0); /* ddef->n2sc null2 scores are initialized */ ddef->nexpected = ddef->btot[sq->n]; /* posterior expectation for # of domains (same as etot[sq->n]) */ if(! save_mode_is_unihit) p7_ReconfigUnihit(gm, saveL); /* process each domain in unihit mode, regardless of gm->mode */ i = -1; triggered = FALSE; for (j = 1; j <= sq->n; j++) { /*printf("GDD j: %5d m: %.5f b: %8.3f e: %8.3f bhere: %8.3f ehere: %8.3f\n", j, ddef->mocc[j], ddef->btot[j], ddef->etot[j], ddef->btot[j] - ddef->btot[j-1], ddef->etot[j] - ddef->etot[j-1]); */ if (! 
triggered) { /* xref J2/101 for what the logic below is: */ if (ddef->mocc[j] - (ddef->btot[j] - ddef->btot[j-1]) < ddef->rt2) i = j; else if (i == -1) i = j; if (ddef->mocc[j] >= ddef->rt1) triggered = TRUE; } else if (ddef->mocc[j] - (ddef->etot[j] - ddef->etot[j-1]) < ddef->rt2) { /* We have a region i..j to evaluate. */ p7_gmx_GrowTo(fwd, gm->M, j-i+1); p7_gmx_GrowTo(bck, gm->M, j-i+1); ddef->nregions++; if (is_multidomain_region(ddef, i, j)) { if(save_mode_is_unihit) return eslEINCONCEIVABLE; /* This region appears to contain more than one domain, so we have to * resolve it by cluster analysis of posterior trace samples, to define * one or more domain envelopes. */ ddef->nclustered++; /* Resolve the region into domains by stochastic trace * clustering; assign position-specific null2 model by * stochastic trace clustering; there is redundancy * here; we will consolidate later if null2 strategy * works */ p7_ReconfigMultihit(gm, saveL); p7_GForward(sq->dsq+i-1, j-i+1, gm, fwd, NULL); glocal_region_trace_ensemble(ddef, gm, sq->dsq, i, j, fwd, bck, do_null2, &nc); p7_ReconfigUnihit(gm, saveL); /* ddef->n2sc is now set on i..j by the traceback-dependent method */ last_j2 = 0; for (d = 0; d < nc; d++) { p7_spensemble_GetClusterCoords(ddef->sp, d, &i2, &j2, NULL, NULL, NULL); if (i2 <= last_j2) ddef->noverlaps++; /* Note that k..m coords on model are available, but * we're currently ignoring them. This leads to a * rare clustering bug that we eventually need to fix * properly [xref J3/32]: two different regions in one * profile HMM might have hit same seq domain, and * when we now go to calculate an OA trace, nothing * constrains us to find the two different alignments * to the HMM; in fact, because OA is optimal, we'll * find one and the *same* alignment, leading to an * apparent duplicate alignment in the output. * * Registered as #h74, Dec 2009, after EBI finds and * reports it. 
#h74 is worked around in p7_tophits.c * by hiding all but one envelope with an identical * alignment, in the rare event that this * happens. [xref J5/130]. */ ddef->nenvelopes++; if (glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i2, j2, TRUE, do_null2, FALSE) == eslOK) last_j2 = j2; } p7_spensemble_Reuse(ddef->sp); p7_trace_Reuse(ddef->tr); } else { /* The region looks simple, single domain; convert the region to an envelope. */ ddef->nenvelopes++; glocal_rescore_isolated_domain(ddef, gm, sq, fwd, bck, i, j, FALSE, do_null2, FALSE); } i = -1; triggered = FALSE; } } /* If profile was unihit upon entrance, we didn't modify its configuration (length nor mode), * else restore it to its original multihit mode, and to its original length model */ if (! save_mode_is_unihit) { p7_ReconfigMultihit(gm, saveL); } return eslOK; }