/* utest_msv()
 *
 * Validate the MSV score against Viterbi (provided we trust Viterbi).
 * A clone of <gm> is rewired into a profile in which:
 *   1. all t_MM scores = 0
 *   2. all other core transitions = -inf
 *   3. all t_BMk entries are uniformly log 2/(M(M+1))
 *
 * For each of <nseq> sequences of length <L>, generated by
 * esl_rsq_xfIID() from the background frequencies <bg->f>, the score
 * from p7_GMSV() on <gm> must match the score from p7_GViterbi() on
 * the rewired clone to within 0.0001. Any failure is reported through
 * esl_fatal().
 *
 * <go> is accepted but not referenced by this test.
 */
static void
utest_msv(ESL_GETOPTS *go, ESL_RANDOMNESS *r, ESL_ALPHABET *abc, P7_BG *bg, P7_PROFILE *gm, int nseq, int L)
{
  P7_PROFILE *msv_like = NULL;   /* clone of <gm>, rewired so Viterbi mimics MSV */
  ESL_DSQ    *seq      = NULL;   /* digital sequence buffer, L+2 entries          */
  P7_GMX     *mx       = NULL;   /* DP matrix shared by both algorithms           */
  float       msv_sc;
  float       vit_sc;
  float       uniform_entry;
  int         node;
  int         n;

  seq = malloc(sizeof(ESL_DSQ) * (L+2));
  if (seq == NULL) esl_fatal("malloc failed");

  mx = p7_gmx_Create(gm->M, L);
  if (mx == NULL) esl_fatal("matrix creation failed");

  msv_like = p7_profile_Clone(gm);
  if (msv_like == NULL) esl_fatal("profile clone failed");

  /* Make the clone's scores appropriate for simulating the MSV algorithm
   * in Viterbi: wipe every transition, then restore only M->M and B->Mk.
   */
  uniform_entry = log(2.0f / ((float) msv_like->M * (float) (msv_like->M+1)));
  esl_vec_FSet(msv_like->tsc, p7P_NTRANS * msv_like->M, -eslINFINITY);
  for (node = 0; node < msv_like->M; node++)
    {
      p7P_TSC(msv_like, node, p7P_BM) = uniform_entry;
      if (node >= 1) p7P_TSC(msv_like, node, p7P_MM) = 0.0f;
    }

  n = 0;
  while (n < nseq)
    {
      if (esl_rsq_xfIID(r, bg->f, abc->K, L, seq) != eslOK)
        esl_fatal("seq generation failed");

      if (p7_GMSV(seq, L, gm, mx, 2.0, &msv_sc) != eslOK)
        esl_fatal("MSV failed");

      if (p7_GViterbi(seq, L, msv_like, mx, &vit_sc) != eslOK)
        esl_fatal("viterbi failed");

      if (fabs(msv_sc - vit_sc) > 0.0001)
        esl_fatal("MSV score not equal to Viterbi score");

      n++;
    }

  p7_gmx_Destroy(mx);
  p7_profile_Destroy(msv_like);
  free(seq);
}
/* Function:  p7_profile_SameAsMF()
 * Synopsis:  Set a generic profile's scores to give MSV scores.
 * Incept:    MSF Tue Nov 3, 2009 [Janelia]
 *
 * Purpose:   Rewrite the core transition scores of generic profile
 *            <gm> so that the normal <dp_generic> DP algorithms are
 *            intended to score like <p7_MSVFilter()>:
 *              - every core transition is first set to -inf;
 *              - t_MM is then set to 0 for nodes 1..M-1;
 *              - every <t_BMk> slot (k = 0..M-1) is set uniformly to
 *                <log 2/(M(M+1))>.
 *
 *            This implementation only touches <gm->tsc>; it does not
 *            read <om>, and it applies no rounding or special-state
 *            (N/C/J) adjustments.
 *
 * Args:      om  - optimized profile (unused here)
 *            gm  - generic profile whose transition scores are overwritten
 *
 * Returns:   <eslOK> on success.
 */
int
p7_profile_SameAsMF(const P7_OPROFILE *om, P7_PROFILE *gm)
{
  const int M             = gm->M;
  float     uniform_entry = log(2.0f / ((float) M * (float) (M+1)));
  int       node;

  /* Transitions: blank everything out, then reinstate B->Mk and M->M only. */
  esl_vec_FSet(gm->tsc, p7P_NTRANS * M, -eslINFINITY);
  for (node = 0; node < M; node++)
    {
      p7P_TSC(gm, node, p7P_BM) = uniform_entry;
      if (node > 0) p7P_TSC(gm, node, p7P_MM) = 0.0f;
    }

  return eslOK;
}
/* sample_endpoints()
 * Incept:    SRE, Mon Jan 22 10:43:20 2007 [Janelia]
 *
 * Purpose:   Given a profile <gm> and random number source <r>, sample
 *            a begin transition from the implicit probabilistic profile
 *            model, yielding a sampled start and end node; return these
 *            via <ret_kstart> and <ret_kend>.
 *
 *            By construction, the entry at node <kstart> is into a
 *            match state, but the exit from node <kend> might turn
 *            out to be from either a match or delete state.
 *
 *            Exits j are assumed to be uniformly distributed for a
 *            particular entry point i: $a_{ij} =$ constant $\forall j$.
 *
 * Args:      r          - random number source
 *            gm         - profile; its B->Mk scores define the entry distribution
 *            ret_kstart - RETURN: sampled start node (0 on error)
 *            ret_kend   - RETURN: sampled end node   (0 on error)
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on allocation error; both outputs are then set to 0.
 *
 * Xref:      STL11/138
 */
static int
sample_endpoints(ESL_RANDOMNESS *r, const P7_PROFILE *gm, int *ret_kstart, int *ret_kend)
{
  float *entry_wt = NULL;   /* unnormalized weight of entering at each node 0..M */
  int    node;
  int    n_exits;
  int    first;
  int    last;
  int    status;

  /* The entry distribution has to be back-calculated from the lod
   * B->Mk scores of a local model. That costs a pass over the model,
   * but this routine is not called often.
   */
  ESL_ALLOC(entry_wt, sizeof(float) * (gm->M+1));

  entry_wt[0] = 0.0f;                      /* node 0 is never an entry point */
  for (node = gm->M; node >= 1; node--)
    {
      n_exits        = gm->M - node + 1;   /* exits j reachable from entry at <node> */
      entry_wt[node] = exp(p7P_TSC(gm, node-1, p7P_BM)) * n_exits;
    }

  /* Start node is drawn from the entry weights; the exit is then drawn
   * uniformly from the exits available to that start.
   */
  first = esl_rnd_FChoose(r, entry_wt, gm->M+1);
  last  = first + esl_rnd_Roll(r, gm->M-first+1);

  free(entry_wt);
  *ret_kstart = first;
  *ret_kend   = last;
  return eslOK;

 ERROR:
  free(entry_wt);
  *ret_kstart = 0;
  *ret_kend   = 0;
  return status;
}
/* Function:  p7_oprofile_GetFwdTransitionArray()
 * Synopsis:  Retrieve full 32-bit float transition probabilities from a
 *            profile into a flat array
 *
 * Purpose:   Extract an array of <type> (e.g. p7O_II) transition
 *            probabilities from the underlying <om> profile. In SIMD
 *            implementations these are striped and interleaved, making
 *            them difficult to access directly. Here it is trivial:
 *            <arr[i]> receives exp() of the stored score of kind <type>
 *            at node index i, for i = 0..M-1.
 *
 * Args:      <om>   - optimized profile, containing transition information
 *            <type> - transition type (e.g. p7O_II)
 *            <arr>  - preallocated array into which floats will be placed;
 *                     <om->M> entries are written
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_oprofile_GetFwdTransitionArray(const P7_OPROFILE *om, int type, float *arr )
{
  float *out  = arr;
  int    node = 0;

  while (node < om->M)
    {
      *out++ = exp(p7P_TSC(om, node, type));   /* score -> probability */
      node++;
    }

  return eslOK;
}
/* Function:  p7_ProfileConfig()
 * Synopsis:  Configure a search profile.
 *
 * Purpose:   Given a model <hmm> with core probabilities, the null1
 *            model <bg>, a desired search <mode> (one of <p7_LOCAL>,
 *            <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an
 *            expected target sequence length <L>; configure the
 *            search model in <gm> with lod scores relative to the
 *            background frequencies in <bg>.
 *
 * Args:      hmm  - source model; must have the <p7H_CONS> flag set
 *            bg   - null model; <bg->f> supplies the background frequencies
 *            gm   - profile to configure; <gm->allocM> must be >= <hmm->M>
 *            L    - expected target sequence length, passed to <p7_ReconfigLength()>
 *            mode - search mode, stored in <gm->mode>
 *
 * Returns:   <eslOK> on success; the profile <gm> now contains
 *            scores and is ready for searching target sequences.
 *
 * Throws:    <eslEMEM> on allocation error.
 *            <eslEINVAL> if the HMM and profile alphabets differ, if
 *            <gm> is too small to hold <hmm>, or if <hmm> has no
 *            consensus annotation.
 *            A non-OK status from <p7_hmm_CalculateOccupancy()>,
 *            <esl_strdup()> or <p7_ReconfigLength()> is passed through.
 *
 *            On failure <gm> may be left partially configured. Any of
 *            <gm->name>, <gm->acc>, <gm->desc> that had been released
 *            but not yet re-duplicated is NULL rather than dangling.
 */
int
p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode)
{
  int    k, x, z;          /* counters over states, residues, annotation */
  int    status;
  float *occ = NULL;       /* match state occupancies; local mode only   */
  float *tp, *rp;
  float  sc[p7_MAXCODE];
  float  Z;

  /* Contract checks */
  if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match");
  if (hmm->M > gm->allocM)             ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM");
  if (! (hmm->flags & p7H_CONS))       ESL_XEXCEPTION(eslEINVAL, "HMM must have a consensus to transfer to the profile");

  /* Copy some pointer references and other info across from HMM */
  gm->M                = hmm->M;
  gm->max_length       = hmm->max_length;
  gm->mode             = mode;
  gm->roff             = -1;
  gm->eoff             = -1;
  gm->offs[p7_MOFFSET] = -1;
  gm->offs[p7_FOFFSET] = -1;
  gm->offs[p7_POFFSET] = -1;

  /* Release the old annotation strings and NULL them immediately: if a
   * later esl_strdup() fails, the fields not yet replaced must not be
   * left pointing at freed memory (the profile's owner would otherwise
   * free them a second time).
   */
  free(gm->name);  gm->name = NULL;
  free(gm->acc);   gm->acc  = NULL;
  free(gm->desc);  gm->desc = NULL;
  if ((status = esl_strdup(hmm->name, -1, &(gm->name))) != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->acc,  -1, &(gm->acc)))  != eslOK) goto ERROR;
  if ((status = esl_strdup(hmm->desc, -1, &(gm->desc))) != eslOK) goto ERROR;

  if (hmm->flags & p7H_RF)    strcpy(gm->rf,        hmm->rf);
  if (hmm->flags & p7H_MMASK) strcpy(gm->mm,        hmm->mm);
  if (hmm->flags & p7H_CONS)  strcpy(gm->consensus, hmm->consensus); /* must be present, actually, so the flag test is just for symmetry w/ other optional HMM fields */
  if (hmm->flags & p7H_CS)    strcpy(gm->cs,        hmm->cs);
  for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z];
  for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z]  = hmm->cutoff[z];
  for (z = 0; z < p7_MAXABET;  z++) gm->compo[z]   = hmm->compo[z];

  /* Entry scores. */
  if (p7_profile_IsLocal(gm))
    {
      /* Local mode entry:  occ[k] /( \sum_i occ[i] * (M-i+1))
       * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0)
       */
      Z = 0.;
      ESL_ALLOC(occ, sizeof(float) * (hmm->M+1));

      if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR;
      for (k = 1; k <= hmm->M; k++)
        Z += occ[k] * (float) (hmm->M-k+1);
      for (k = 1; k <= hmm->M; k++)
        p7P_TSC(gm, k-1, p7P_BM) = log(occ[k] / Z); /* note off-by-one: entry at Mk stored as [k-1][BM] */

      /* Reset after freeing: the ERROR path below frees <occ> too, and
       * can still be reached from p7_ReconfigLength() at the end.
       */
      free(occ);
      occ = NULL;
    }
  else  /* glocal modes: left wing retraction; must be in log space for precision */
    {
      Z = log(hmm->t[0][p7H_MD]);
      p7P_TSC(gm, 0, p7P_BM) = log(1.0 - hmm->t[0][p7H_MD]);
      for (k = 1; k < hmm->M; k++)
        {
          p7P_TSC(gm, k, p7P_BM) = Z + log(hmm->t[k][p7H_DM]);
          Z += log(hmm->t[k][p7H_DD]);
        }
    }

  /* E state loop/move probabilities: nonzero for MOVE allows loops/multihits
   * N,C,J transitions are set later by length config
   */
  if (p7_profile_IsMultihit(gm)) {
    gm->xsc[p7P_E][p7P_MOVE] = -eslCONST_LOG2;
    gm->xsc[p7P_E][p7P_LOOP] = -eslCONST_LOG2;
    gm->nj                   = 1.0f;
  } else {
    gm->xsc[p7P_E][p7P_MOVE] = 0.0f;
    gm->xsc[p7P_E][p7P_LOOP] = -eslINFINITY;
    gm->nj                   = 0.0f;
  }

  /* Transition scores. */
  for (k = 1; k < gm->M; k++) {
    tp = gm->tsc + k * p7P_NTRANS;

    tp[p7P_MM] = log(hmm->t[k][p7H_MM]);
    tp[p7P_MI] = log(hmm->t[k][p7H_MI]);
    tp[p7P_MD] = log(hmm->t[k][p7H_MD]);
    tp[p7P_IM] = log(hmm->t[k][p7H_IM]);
    tp[p7P_II] = log(hmm->t[k][p7H_II]);
    tp[p7P_DM] = log(hmm->t[k][p7H_DM]);
    tp[p7P_DD] = log(hmm->t[k][p7H_DD]);
  }

  /* Match emission scores. */
  sc[hmm->abc->K]    = -eslINFINITY; /* gap character */
  sc[hmm->abc->Kp-2] = -eslINFINITY; /* nonresidue character */
  sc[hmm->abc->Kp-1] = -eslINFINITY; /* missing data character */
  for (k = 1; k <= hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++)
      sc[x] = log((double)hmm->mat[k][x] / bg->f[x]);
    esl_abc_FExpectScVec(hmm->abc, sc, bg->f);
    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k * p7P_NR;
      rp[p7P_MSC] = sc[x];
    }
  }

  /* Insert emission scores */
  /* SRE, Fri Dec 5 08:41:08 2008: We currently hardwire insert scores
   * to 0, i.e. corresponding to the insertion emission probabilities
   * being equal to the background probabilities. Benchmarking shows
   * that setting inserts to informative emission distributions causes
   * more problems than it's worth: polar biased composition hits
   * driven by stretches of "insertion" occur, and are difficult to
   * correct for.
   */
  for (x = 0; x < gm->abc->Kp; x++)
    {
      for (k = 1; k < hmm->M; k++) p7P_ISC(gm, k, x) = 0.0f;
      p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible. */
    }
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->K)    = -eslINFINITY; /* gap symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-2) = -eslINFINITY; /* nonresidue symbol */
  for (k = 1; k <= hmm->M; k++) p7P_ISC(gm, k, gm->abc->Kp-1) = -eslINFINITY; /* missing data symbol */

#if 0
  /* original (informative) insert setting: relies on sc[K, Kp-1] initialization to -inf above */
  for (k = 1; k < hmm->M; k++) {
    for (x = 0; x < hmm->abc->K; x++)
      sc[x] = log(hmm->ins[k][x] / bg->f[x]);
    esl_abc_FExpectScVec(hmm->abc, sc, bg->f);
    for (x = 0; x < hmm->abc->Kp; x++) {
      rp = gm->rsc[x] + k*p7P_NR;
      rp[p7P_ISC] = sc[x];
    }
  }
  for (x = 0; x < hmm->abc->Kp; x++)
    p7P_ISC(gm, hmm->M, x) = -eslINFINITY;   /* init I_M to impossible. */
#endif

  /* Remaining specials, [NCJ][MOVE | LOOP] are set by ReconfigLength() */
  gm->L = 0;   /* force ReconfigLength to reconfig */
  if ((status = p7_ReconfigLength(gm, L)) != eslOK) goto ERROR;
  return eslOK;

 ERROR:
  free(occ);   /* NULL unless a failure occurred while <occ> was live */
  return status;
}
/* Function: p7_ProfileConfig() * Synopsis: Configure a search profile. * Incept: SRE, Sun Sep 25 12:21:25 2005 [St. Louis] * * Purpose: Given a model <hmm> with core probabilities, the null1 * model <bg>, a desired search <mode> (one of <p7_LOCAL>, * <p7_GLOCAL>, <p7_UNILOCAL>, or <p7_UNIGLOCAL>), and an * expected target sequence length <L>; configure the * search model in <gm> with lod scores relative to the * background frequencies in <bg>. * * Returns: <eslOK> on success; the profile <gm> now contains * scores and is ready for searching target sequences. * * Throws: <eslEMEM> on allocation error. */ int p7_ProfileConfig(const P7_HMM *hmm, const P7_BG *bg, P7_PROFILE *gm, int L, int mode) { int k, x, z; /* counters over states, residues, annotation */ int status; float *occ = NULL; float *tp, *rp; float sc[p7_MAXCODE]; float mthresh; float Z; /* Contract checks */ if (gm->abc->type != hmm->abc->type) ESL_XEXCEPTION(eslEINVAL, "HMM and profile alphabet don't match"); if (hmm->M > gm->allocM) ESL_XEXCEPTION(eslEINVAL, "profile too small to hold HMM"); /* Copy some pointer references and other info across from HMM */ gm->M = hmm->M; gm->mode = mode; gm->roff = -1; gm->eoff = -1; gm->offs[p7_MOFFSET] = -1; gm->offs[p7_FOFFSET] = -1; gm->offs[p7_POFFSET] = -1; if (gm->name != NULL) free(gm->name); if (gm->acc != NULL) free(gm->acc); if (gm->desc != NULL) free(gm->desc); if ((status = esl_strdup(hmm->name, -1, &(gm->name))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->acc, -1, &(gm->acc))) != eslOK) goto ERROR; if ((status = esl_strdup(hmm->desc, -1, &(gm->desc))) != eslOK) goto ERROR; if (hmm->flags & p7H_RF) strcpy(gm->rf, hmm->rf); if (hmm->flags & p7H_CS) strcpy(gm->cs, hmm->cs); for (z = 0; z < p7_NEVPARAM; z++) gm->evparam[z] = hmm->evparam[z]; for (z = 0; z < p7_NCUTOFFS; z++) gm->cutoff[z] = hmm->cutoff[z]; for (z = 0; z < p7_MAXABET; z++) gm->compo[z] = hmm->compo[z]; /* Determine the "consensus" residue for each match position. 
* This is only used for alignment displays, not in any calculations. */ if (hmm->abc->type == eslAMINO) mthresh = 0.5; else if (hmm->abc->type == eslDNA) mthresh = 0.9; else if (hmm->abc->type == eslRNA) mthresh = 0.9; else mthresh = 0.5; gm->consensus[0] = ' '; for (k = 1; k <= hmm->M; k++) { x = esl_vec_FArgMax(hmm->mat[k], hmm->abc->K); gm->consensus[k] = ((hmm->mat[k][x] > mthresh) ? toupper(hmm->abc->sym[x]) : tolower(hmm->abc->sym[x])); } gm->consensus[hmm->M+1] = '\0'; /* Entry scores. */ if (p7_profile_IsLocal(gm)) { /* Local mode entry: occ[k] /( \sum_i occ[i] * (M-i+1)) * (Reduces to uniform 2/(M(M+1)) for occupancies of 1.0) */ Z = 0.; ESL_ALLOC_WITH_TYPE(occ, float*, sizeof(float) * (hmm->M+1)); if ((status = p7_hmm_CalculateOccupancy(hmm, occ, NULL)) != eslOK) goto ERROR; for (k = 1; k <= hmm->M; k++) Z += occ[k] * (float) (hmm->M-k+1); for (k = 1; k <= hmm->M; k++) p7P_TSC(gm, k-1, p7P_BM) = log((double)(occ[k] / Z)); /* note off-by-one: entry at Mk stored as [k-1][BM] */ free(occ); } else /* glocal modes: left wing retraction; must be in log space for precision */ {