/* Function: CPlan9SWConfig()
 * EPN 05.30.06
 * based on SRE's Plan7SWConfig() from HMMER's plan7.c
 *
 * Purpose:  Put a CM Plan 9 model into hmmsw (Smith/Waterman)
 *           configuration by setting its alignment independent
 *           entry (begin) and exit (end) probabilities.
 *
 * Notes:    The aim is that every fragment ij (entered at match i,
 *           left at match j) is equiprobable, so that the choice of
 *           entry/exit point carries no information. There are
 *           M(M+1)/2 such fragments, so each should get probability
 *           2/M(M+1); that probability is the product of a begin,
 *           an end, and all the not-end probabilities on the path
 *           between i and j.
 *
 *           Entry and exit are asymmetric because of the left/right
 *           nature of the profile. Entry is easy: each of the M-1
 *           internal match states gets pentry / (M-1). The same
 *           trick does not give a flat distribution over exit
 *           points, because an exit probability has to be corrected
 *           for the chance of having already left the model.
 *           Requiring a flat exit distribution gives:
 *               p_1 = pexit / (M-1)
 *               p_x = p_1 / (1 - (x-1) p_1)
 *
 *           Modified EPN, Thu Feb 7 15:54:16 2008:
 *           if <do_match_local_cm>, insertions before the first
 *           emitting match state (from I_0) and after the last one
 *           (from I_M) are disallowed, to better match a locally
 *           configured CM. I_0 maps to ROOT_IL and I_M maps to
 *           ROOT_IR, neither of which can be entered in a locally
 *           configured CM (the ROOT_S state must jump into a local
 *           begin state, and those are always match states). The
 *           M_0->D_1 transition is disallowed for the same reason.
 *
 *           <do_match_local_cm> is usually TRUE. The exception is a
 *           CP9 configured for eventual sub CM alignment, where the
 *           only goal is to find the most likely start/end point of
 *           the alignment and I_0 and I_M should stay reachable.
 *
 * Args:     hmm               - the CM Plan 9 model w/ data-dep prob's valid
 *           pentry            - probability of an internal entry somewhere;
 *                               spread evenly over M-1 match states
 *           pexit             - probability of an internal exit somewhere;
 *                               distributed over M-1 match states
 *           do_match_local_cm - TRUE to make I_0, D_1 and I_M unreachable
 *                               to better match a locally configured CM
 *           first_cm_ndtype   - only read if do_match_local_cm is TRUE:
 *                               MATL or MATP: D_1 should be unreachable
 *                               MATR or MATP: D_M should be unreachable
 *
 * Return:   (void)
 *           HMM probabilities are modified, and the model's log-odds
 *           scores are recomputed via CP9Logoddsify().
 */
void
CPlan9SWConfig(CP9_t *hmm, float pentry, float pexit, int do_match_local_cm, int first_cm_ndtype)
{
  float  exit_base;   /* p_1 of the exit distribution                    */
  float  norm;        /* denominator for renormalizing M_M               */
  double entry_mass;  /* 1 - (mass M_0 spends on its I, D and EL moves)  */
  int    node;        /* counter over model nodes                        */

  /* No special (*x* states in Plan 7) states in CM Plan 9 */

  /* Optionally prune the transitions a locally configured CM can never use. */
  if (do_match_local_cm)
    {
      const int is_matp   = (first_cm_ndtype == MATP_nd);
      const int blocks_d1 = (first_cm_ndtype == MATL_nd) || is_matp;
      const int blocks_dM = (first_cm_ndtype == MATR_nd) || is_matp;
      float    *t_first   = hmm->t[0];
      float    *t_last    = hmm->t[hmm->M];

      t_first[CTMI]  = 0.;
      t_first[CTMM]  = 0.;   /* already 0.0: the M_0 -> M_1 move is begin[1]   */
      t_first[CTMEL] = 0.;   /* already 0.0: no local end is possible from M_0 */
      if (blocks_d1) t_first[CTMD] = 0.;   /* the CM cannot reach the delete state mapping to D_1 */

      t_last[CTMI] = 0.;
      t_last[CTDI] = 0.;
      /* NOTE(review): the stated intent is to make D_M unreachable, but this
       * clears the CTMD entry of node M itself, whereas D_1 above is blocked
       * through node 0's CTMD entry. Confirm this index is the intended one.
       */
      if (blocks_dM) t_last[CTMD] = 0.;

      /* renormalize transitions out of M_M (match transitions plus end[M]) */
      norm = esl_vec_FSum(t_last, cp9_TRANS_NMATCH) + hmm->end[hmm->M];
      esl_vec_FScale(t_last, cp9_TRANS_NMATCH, 1./norm);
      hmm->end[hmm->M] /= norm;

      /* renormalize transitions out of D_M */
      esl_vec_FNorm(t_last + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);
    }

  /* Configure entry.
   * Whatever M_0 does not spend on CTMI/CTMD/CTMEL is shared between
   * begin[1] (fraction 1-pentry) and begin[2..M] (pentry, split evenly).
   * t[0][CTMEL] is expected to be 0. here, and if do_match_local_cm then
   * t[0][CTMI] was just zeroed (and t[0][CTMD] too for MATL/MATP).
   */
  entry_mass    = 1. - (hmm->t[0][CTMI] + hmm->t[0][CTMD] + hmm->t[0][CTMEL]);
  hmm->begin[1] = (1. - pentry) * entry_mass;
  esl_vec_FSet(hmm->begin+2, hmm->M-1, (pentry * entry_mass) / (float)(hmm->M-1));

  /* Configure exit.
   * Nodes 1..M-1 get the flat-exit formula; hmm->end[hmm->M] is not touched.
   */
  exit_base = pexit / (float) (hmm->M-1);
  node = 1;
  while (node < hmm->M)
    {
      hmm->end[node] = exit_base / (1. - exit_base * (float) (node-1));
      node++;
    }
  CPlan9RenormalizeExits(hmm, 1);

  hmm->flags &= ~CPLAN9_HASBITS;                          /* reconfig invalidates log-odds scores */
  hmm->flags |= (CPLAN9_LOCAL_BEGIN | CPLAN9_LOCAL_END);  /* local begins and local ends now on   */

  CP9Logoddsify(hmm);
}
/* Function: esl_vec_FNorm()
 *
 * Purpose:  Normalize the float vector <vec> of length <n> in place
 *           by dividing every element by the vector's sum, as
 *           computed by esl_vec_FSum(). If that sum is exactly
 *           zero, every element is set to 1/n instead.
 *
 * Args:     vec - vector to normalize; overwritten
 *           n   - number of elements in <vec>
 *
 * Return:   (void)
 */
void
esl_vec_FNorm(float *vec, int n)
{
  const float total = esl_vec_FSum(vec, n);
  int         i;

  if (total == 0.0)
    {
      /* nothing to scale by: fall back to a uniform vector */
      for (i = 0; i < n; i++)
        vec[i] = 1. / (float) n;
      return;
    }

  for (i = 0; i < n; i++)
    vec[i] /= total;
}
/* Function: CPlan9Renormalize()
 *
 * Purpose:  Take an HMM in counts form and renormalize all of its
 *           probability vectors: match and insert emissions, begin
 *           transitions, the per-node match/insert/delete
 *           transitions (with end[k] counted among the match
 *           transitions), and the null model. Also enforces the
 *           CM Plan 9 restrictions on nonexistent transitions:
 *           the emissions of M_0 and the delete transitions of
 *           node 0 are forced to zero.
 *
 * Args:     hmm - the model to renormalize.
 *
 * Return:   (void)
 *           hmm is changed; CPLAN9_HASBITS is cleared and
 *           CPLAN9_HASPROB is set in hmm->flags.
 */
void
CPlan9Renormalize(CP9_t *hmm)
{
  float  denom;   /* normalization denominator         */
  float *trans;   /* transition vector of current node */
  int    node;    /* counter for model position        */

  /* match emissions; M_0 is the B state, a non-emitter */
  esl_vec_FSet(hmm->mat[0], hmm->abc->K, 0.);
  for (node = 1; node <= hmm->M; node++)
    esl_vec_FNorm(hmm->mat[node], hmm->abc->K);

  /* insert emissions */
  for (node = 0; node <= hmm->M; node++)
    esl_vec_FNorm(hmm->ins[node], hmm->abc->K);

  /* begin transitions: begin[1..M] compete with M_0's CTMI, CTMD and CTMEL.
   * t[0][CTMEL] should always be 0., can't local end from the M_0 == B state.
   */
  trans = hmm->t[0];
  denom = esl_vec_FSum(hmm->begin+1, hmm->M) + trans[CTMI] + trans[CTMD] + trans[CTMEL];
  esl_vec_FScale(hmm->begin+1, hmm->M, 1./denom);
  trans[CTMI]  /= denom;
  trans[CTMD]  /= denom;
  trans[CTMEL] /= denom;

  /* transitions out of the node 0 insert (state N); node 0 has no delete state */
  esl_vec_FNorm(trans + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);
  esl_vec_FSet (trans + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE, 0.);

  /* main model transitions; safe for node M too, t[M][CTMM] should be 0. */
  for (node = 1; node <= hmm->M; node++)
    {
      trans = hmm->t[node];

      denom = esl_vec_FSum(trans, cp9_TRANS_NMATCH) + hmm->end[node];
      esl_vec_FScale(trans, cp9_TRANS_NMATCH, 1./denom);
      hmm->end[node] /= denom;

      esl_vec_FNorm(trans + cp9_TRANS_INSERT_OFFSET, cp9_TRANS_NINSERT);  /* insert */
      esl_vec_FNorm(trans + cp9_TRANS_DELETE_OFFSET, cp9_TRANS_NDELETE);  /* delete */
    }

  /* null model emissions */
  esl_vec_FNorm(hmm->null, hmm->abc->K);

  /* log-odds scores are stale now; probabilities are valid */
  hmm->flags = (hmm->flags & ~CPLAN9_HASBITS) | CPLAN9_HASPROB;
}
/* Note that calculate_occupancy has moved to p7_hmm.c, but
 * unit tests over there aren't hooked up yet; so leave a copy of the unit test
 * here for now.
 *
 * utest_occupancy()
 *
 * Purpose:  Compute the occupancy vector of <hmm> with
 *           p7_hmm_CalculateOccupancy() and check that the mean over
 *           positions 1..M compares equal to 0.6 under
 *           esl_FCompare() with tolerance 0.1.
 *
 * Args:     hmm - model to test; hmm->M sets the number of positions.
 *
 * Return:   (void)
 *           Calls esl_fatal() if the workspace cannot be allocated
 *           or if the occupancy check fails.
 */
static void
utest_occupancy(P7_HMM *hmm)
{
  char  *msg = "modelconfig.c::calculate_occupancy() unit test failed";
  float *occ;
  float  x;

  /* occ is indexed 0..M, hence M+1 elements */
  occ = malloc(sizeof(*occ) * (hmm->M+1));
  if (occ == NULL)
    {
      /* never hand a NULL workspace to the occupancy calculation */
      esl_fatal("modelconfig.c::calculate_occupancy() unit test: allocation failed");
      return;
    }

  p7_hmm_CalculateOccupancy(hmm, occ, NULL);

  /* mean occupancy over positions 1..M */
  x = esl_vec_FSum(occ+1, hmm->M) / (float) hmm->M;
  if (esl_FCompare(x, 0.6, 0.1) != eslOK) esl_fatal(msg);

  free(occ);
  return;
}
/* utest_correct_normalization()
 *
 * Purpose:  Sample a random digital sequence of length <L> from the
 *           background frequencies, run Forward/Backward and the
 *           null2 calculation on it, convert the resulting null2
 *           values from log-odds scores to frequencies, and check
 *           that those frequencies sum to 1.0 within +/- 0.01.
 *
 * Args:     r   - random number source for the sampled sequence
 *           gm  - profile to score with
 *           bg  - background model; bg->f supplies residue frequencies
 *           dsq - caller-provided space for the sampled sequence
 *           L   - length of the sequence to sample
 *           fwd - matrix for Forward; later reused as workspace
 *           bck - matrix for Backward; later overwritten with posteriors
 *
 * Return:   (void)
 *           Calls esl_fatal() if the sum is out of range.
 */
static void
utest_correct_normalization(ESL_RANDOMNESS *r, P7_PROFILE *gm, P7_BG *bg, ESL_DSQ *dsq, int L, P7_GMX *fwd, P7_GMX *bck)
{
  char  *msg = "normalization unit test failed";
  float  freq[p7_MAXABET];   /* null2 scores, then frequencies f_d */
  float  total;
  int    a;                  /* counter over residues              */

  /* sample a random digital seq of length L */
  esl_rsq_xfIID(r, bg->f, gm->abc->K, L, dsq);

  p7_GForward (dsq, L, gm, fwd, NULL);
  p7_GBackward(dsq, L, gm, bck, NULL);

  /* done in place: <bck> now contains posterior probs */
  p7_PosteriorNull2(L, gm, fwd, bck, bck);

  /* <fwd> is no longer needed as a Forward matrix; use it as workspace */
  p7_Null2Corrections(gm, dsq, L, 0, bck, fwd, freq, NULL, NULL);

  /* Convert null2 from lod score to frequencies f_d */
  a = 0;
  while (a < gm->abc->K)
    {
      freq[a] = exp(freq[a]) * bg->f[a];
      a++;
    }

  total = esl_vec_FSum(freq, gm->abc->K);
  if      (total < 0.99) esl_fatal(msg);
  else if (total > 1.01) esl_fatal(msg);
}
/* Function: CPlan9RenormalizeExits()
 * EPN 05.30.06 based on SRE's Plan7RenormalizeExits() from
 *              HMMER's plan7.c.
 *
 * Date:     SRE, Fri Aug 14 11:22:19 1998 [St. Louis]
 *
 * Purpose:  Renormalize just the match state transitions; for
 *           instance, after a Config() function has modified the
 *           exit distribution. For each node k in 1..M-1 (other
 *           than spos-1) the first four entries of t[k] are
 *           rescaled so that they sum to 1 - end[k]. Node M is
 *           handled separately: only its CTMI and CTMEL entries
 *           are rescaled, to sum to 1 - end[M].
 *
 * Args:     hmm  - hmm to renormalize
 *           spos - first consensus column modelled by original
 *                  CP9 HMM the sub CP9 HMM models. Often 1.
 *
 * Returns:  void
 */
void
CPlan9RenormalizeExits(CP9_t *hmm, int spos)
{
  float   total;     /* current sum of the transitions being rescaled */
  double  rescale;   /* factor applied to node M's transitions        */
  float  *t_last;    /* transition vector of node M                   */
  int     node;

  /* We can't exit from node 0 so we start renormalizing at node 1 */
  for (node = 1; node < hmm->M; node++)
    {
      if (node == (spos-1)) continue;   /* we can't exit from the M_spos-1 */

      total = esl_vec_FSum(hmm->t[node], 4);
      esl_vec_FScale(hmm->t[node], 4, (1.-hmm->end[node])/total);
    }

  /* Take care of hmm->M node, which is special:
   * CTMD is IMPOSSIBLE, and CTMM is represented by hmm->end[hmm->M].
   */
  t_last = hmm->t[hmm->M];
  total  = t_last[CTMI] + t_last[CTMEL];
  if (fabs(total-0.) < eslSMALLX1) return;   /* don't divide by a zero sum */

  rescale = (1.-hmm->end[hmm->M])/total;
  t_last[CTMI]  *= rescale;
  t_last[CTMEL] *= rescale;
  return;
}
/* Function:  p7_bg_Read()
 * Synopsis:  Read background frequencies from a file.
 *
 * Purpose:   Read new background frequencies from file <bgfile>,
 *            overwriting the frequencies previously in the
 *            <P7_BG> object <bg>.
 *
 *            Note that <bg> is already created by the caller, not
 *            created here. Also note that <p7_bg_Read()> only reads
 *            residue background frequencies used for the "null
 *            model", whereas a <P7_BG> object contains additional
 *            information for the bias filter and for the biased
 *            composition correction.
 *
 *            The file format, as parsed here: '#' starts a comment;
 *            the first token is the alphabet type; each following
 *            line holds exactly one canonical residue letter and
 *            one probability. Every canonical residue must appear
 *            exactly once, every probability must lie in [0,1], and
 *            the probabilities must sum to 1.0 (tolerance 0.001).
 *            The accepted frequencies are renormalized before they
 *            are copied into <bg>.
 *
 * Args:      bgfile  - file to read.
 *            bg      - existing <P7_BG> object provided by the caller.
 *            errbuf  - OPTIONAL: space for an error message, upon parse errors; or NULL.
 *
 * Returns:   <eslOK> on success, and background frequencies in <bg>
 *            are overwritten.
 *
 *            <eslENOTFOUND> if <bgfile> can't be opened for reading.
 *            <eslEFORMAT> if parsing of <bgfile> fails for some
 *            reason. In both cases, <errbuf> contains a
 *            user-directed error message upon return, including (if
 *            relevant) the file name <bgfile> and the line number on
 *            which an error was detected. <bg> is unmodified.
 *
 * Throws:    <eslEMEM> on allocation failure; <bg> is unmodified,
 *            and <errbuf> is empty.
 */
int
p7_bg_Read(char *bgfile, P7_BG *bg, char *errbuf)
{
  ESL_FILEPARSER *efp = NULL;
  float          *fq  = NULL;   /* staging copy; <bg> is only written on success */
  int             n   = 0;      /* number of residue lines parsed                */
  char           *tok;
  int             toklen;
  int             alphatype;
  ESL_DSQ         x;
  int             status;

  if (errbuf) errbuf[0] = '\0';

  status = esl_fileparser_Open(bgfile, NULL, &efp);
  if      (status == eslENOTFOUND) ESL_XFAIL(eslENOTFOUND, errbuf, "couldn't open bg file %s for reading", bgfile);
  else if (status != eslOK)        goto ERROR;

  esl_fileparser_SetCommentChar(efp, '#');

  /* First token is alphabet type: amino | DNA | RNA */
  status = esl_fileparser_GetToken(efp, &tok, &toklen);
  if      (status == eslEOF) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
  else if (status != eslOK)  goto ERROR;

  alphatype = esl_abc_EncodeType(tok);
  if      (alphatype == eslUNKNOWN)    ESL_XFAIL(eslEFORMAT, errbuf, "expected alphabet type but saw \"%s\" [line %d of bgfile %s]", tok, efp->linenumber, bgfile);
  else if (alphatype != bg->abc->type) ESL_XFAIL(eslEFORMAT, errbuf, "bg file's alphabet is %s; expected %s [line %d, %s]", tok, esl_abc_DecodeType(bg->abc->type), efp->linenumber, bgfile);

  /* -1.0 marks "not parsed yet". Accepted values are confined to [0,1]
   * below, so a parsed value can never be mistaken for this sentinel.
   */
  ESL_ALLOC(fq, sizeof(float) * bg->abc->K);
  esl_vec_FSet(fq, bg->abc->K, -1.0);

  while ((status = esl_fileparser_NextLine(efp)) == eslOK)
    {
      /* field 1: a single canonical residue letter, not seen before */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;

      if (toklen != 1 || ! esl_abc_CIsCanonical(bg->abc, *tok))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected to parse a residue letter; saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      x = esl_abc_DigitizeSymbol(bg->abc, *tok);
      if (fq[x] != -1.0)
        ESL_XFAIL(eslEFORMAT, errbuf, "already parsed probability of %c [line %d of bgfile %s]", bg->abc->sym[x], efp->linenumber, bgfile);
      n++;

      /* field 2: that residue's probability */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslEOL) ESL_XFAIL(eslEFORMAT, errbuf, "premature end of file, expected a probability [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslOK)  goto ERROR;
      if (! esl_str_IsReal(tok))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      fq[x] = atof(tok);
      /* written as a negated range test so that a NaN is rejected too */
      if (! (fq[x] >= 0.0 && fq[x] <= 1.0))
        ESL_XFAIL(eslEFORMAT, errbuf, "expected a probability in the range 0..1, saw %s [line %d of bgfile %s]", tok, efp->linenumber, bgfile);

      /* nothing else is allowed on the line */
      status = esl_fileparser_GetTokenOnLine(efp, &tok, &toklen);
      if      (status == eslOK)  ESL_XFAIL(eslEFORMAT, errbuf, "extra unexpected data found [line %d of bgfile %s]", efp->linenumber, bgfile);
      else if (status != eslEOL) goto ERROR;
    }
  if (status != eslEOF) goto ERROR;

  if (n != bg->abc->K)
    ESL_XFAIL(eslEFORMAT, errbuf, "expected %d residue frequencies, but found %d in bgfile %s", bg->abc->K, n, bgfile);
  if (esl_FCompare(esl_vec_FSum(fq, bg->abc->K), 1.0, 0.001) != eslOK)
    ESL_XFAIL(eslEFORMAT, errbuf, "residue frequencies do not sum to 1.0 in bgfile %s", bgfile);

  /* all checking complete. no more error cases. overwrite bg with the new frequencies */
  esl_vec_FNorm(fq, bg->abc->K);
  esl_vec_FCopy(fq, bg->abc->K, bg->f);

  free(fq);
  esl_fileparser_Close(efp);
  return eslOK;

 ERROR:
  free(fq);                              /* free(NULL) is a no-op */
  if (efp) esl_fileparser_Close(efp);
  return status;
}