static void utest_FLogsumSpecials(void) { char *msg = "logsum specials unit test failed"; if (p7_FLogsum(0.0, -eslINFINITY) != 0.0) esl_fatal(msg); if (p7_FLogsum(-eslINFINITY, 0.0) != 0.0) esl_fatal(msg); if (p7_FLogsum(-eslINFINITY, -eslINFINITY) != -eslINFINITY) esl_fatal(msg); }
/* Function: p7_FLogsumError() * Synopsis: Compute absolute error in probability from Logsum. * * Purpose: Compute the absolute error in probability space * resulting from <p7_FLogsum()>'s table lookup * approximation: approximation result - exact result. * * This is of course computable analytically for * any <a,b> given <p7_LOGSUM_TBL>; but the function * is useful for some routines that want to determine * if <p7_FLogsum()> has been compiled in its * exact slow mode for debugging purposes. Testing * <p7_FLogsumError(-0.4, -0.5) > 0.0001> * for example, suffices to detect that the function * is compiled in its fast approximation mode given * the defaults. */ float p7_FLogsumError(float a, float b) { float approx = p7_FLogsum(a,b); float exact = log(exp(a) + exp(b)); return (exp(approx) - exp(exact)); }
int main(int argc, char **argv) { ESL_GETOPTS *go = p7_CreateDefaultApp(options, 0, argc, argv, banner, usage); ESL_RANDOMNESS *r = esl_randomness_CreateFast(esl_opt_GetInteger(go, "-s")); ESL_STOPWATCH *w = esl_stopwatch_Create(); int N = esl_opt_GetInteger(go, "-N"); int i; float *A, *B, *C; p7_FLogsumInit(); /* Create the problem: sample N values A,B on interval -1000,1000: about the range of H3 scores */ A = malloc(sizeof(float) * N); B = malloc(sizeof(float) * N); C = malloc(sizeof(float) * N); for (i = 0; i < N; i++) { A[i] = esl_random(r) * 2000. - 1000.; B[i] = esl_random(r) * 2000. - 1000.; } /* Run */ esl_stopwatch_Start(w); if (esl_opt_GetBoolean(go, "-n")) { for (i = 0; i < N; i++) C[i] = naive2(A[i], B[i]); } else if (esl_opt_GetBoolean(go, "-r")) { for (i = 0; i < N; i++) C[i] = naive1(A[i], B[i]); } else { for (i = 0; i < N; i++) C[i] = p7_FLogsum(A[i], B[i]); } esl_stopwatch_Stop(w); esl_stopwatch_Display(stdout, w, "# CPU time: "); esl_stopwatch_Destroy(w); esl_randomness_Destroy(r); esl_getopts_Destroy(go); return 0; }
int main(int argc, char **argv) { float a = atof(argv[1]); float b = atof(argv[2]); float result; p7_FLogsumInit(); result = p7_FLogsum(a, b); printf("p7_FLogsum(%f,%f) = %f\n", a, b, result); result = log(exp(a) + exp(b)); printf("log(e^%f + e^%f) = %f\n", a, b, result); printf("Absolute error in probability: %f\n", p7_FLogsumError(a,b)); return eslOK; }
static void utest_FLogsumError(ESL_GETOPTS *go, ESL_RANDOMNESS *r) { int N = esl_opt_GetInteger(go, "-N"); float maxval = esl_opt_GetReal(go, "-S"); int be_verbose = esl_opt_GetBoolean(go, "-v"); float maxerr = 0.0; float avgerr = 0.0; int i; float a,b,result,exact,err; for (i = 0; i < N; i++) { a = (esl_random(r) - 0.5) * maxval * 2.; /* uniform draws on -maxval..maxval */ b = (esl_random(r) - 0.5) * maxval * 2.; exact = log(exp(a) + exp(b)); result = p7_FLogsum(a,b); err = fabs(exact-result) / maxval; avgerr += err; maxerr = ESL_MAX(maxerr, err); if (be_verbose) printf("%8.4f %8.4f %8.4f %8.4f %8.4f\n", a, b, exact, result, err); } avgerr /= (float) N; if (be_verbose) { printf("average error = %f\n", avgerr); printf("max error = %f\n", maxerr); } if (maxerr > 0.0001) esl_fatal("maximum error of %f is too high: logsum unit test fails", maxerr); if (avgerr > 0.0001) esl_fatal("average error of %f is too high: logsum unit test fails", avgerr); }
/* Function: p7_null3_score() * * Purpose: Calculate a correction (in log_2 odds) to be applied * to a sequence, using a null model based on the * composition of the target sequence. * The null model is constructed /post hoc/ as the * distribution of the target sequence; if the target * sequence is 40% A, 5% C, 5% G, 40% T, then the null * model is (0.4, 0.05, 0.05, 0.4). This function is * based heavily on Infernal's ScoreCorrectionNull3(), * with two important changes: * - it leaves the log2 conversion from NATS to BITS * for the calling function. * - it doesn't include the omega score modifier * (based on prior probability of using the null3 * model), again leaving this to the calling function. * * Args: abc - alphabet for hit (only used to get alphabet size) * dsq - the sequence the hit resides in * tr - trace of the alignment, used to find the match states * (non-match chars are ignored in computing freq, not used if NULL) * start - start position of hit in dsq * stop - end position of hit in dsq * bg - background, used for the default null model's emission freq * ret_sc - RETURN: the correction to the score (in NATS); * caller subtracts this from hit score to get * corrected score. * Return: void, ret_sc: the log-odds score correction (in NATS). */ void p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc) { float score = 0.; int status; int i; float *freq; int dir; int tr_pos; ESL_ALLOC(freq, sizeof(float) * abc->K); esl_vec_FSet(freq, abc->K, 0.0); /* contract check */ if(abc == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", ""); if(dsq == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", ""); if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", ""); dir = start < stop ? 1 : -1; if (tr != NULL) { /* skip the parts of the trace that precede the first match state */ tr_pos = 2; i = start; while (tr->st[tr_pos] != p7T_M) { if (tr->st[tr_pos] == p7T_N) i += dir; tr_pos++; } /* tally frequencies from characters hitting match state*/ while (tr->st[tr_pos] != p7T_E) { if (tr->st[tr_pos] == p7T_M) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } if (tr->st[tr_pos] != p7T_D ) i += dir; tr_pos++; } } else { /* tally frequencies from the full envelope */ for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } } esl_vec_FNorm(freq, abc->K); /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/ for (i = 0; i < abc->K; i++) score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ; /* Return the correction to the bit score. */ score = p7_FLogsum(0., score); *ret_sc = score; return; ERROR: esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", ""); return; /* never reached */ }
/* Function: p7_GNull2_ByExpectation() * Synopsis: Calculate null2 model from posterior probabilities. * Incept: SRE, Thu Feb 28 09:52:28 2008 [Janelia] * * Purpose: Calculate the "null2" model for the envelope encompassed * by a posterior probability calculation <pp> for model * <gm>. Return the null2 odds emission probabilities * $\frac{f'{x}}{f{x}}$ in <null2>, which caller * provides as space for at least <alphabet->Kp> residues. * * The expectation method is applied to envelopes in * simple, well resolved regions (regions containing just a * single envelope, where no stochastic traceback * clustering was required). * * Make sure that the posterior probability matrix <pp> has * been calculated by the caller for only the envelope; thus * its rows are numbered <1..Ld>, for envelope <ienv..jenv> * of length <Ld=jenv-ienv+1>. * * Args: gm - profile, in any mode, target length model set to <L> * pp - posterior prob matrix, for <gm> against domain envelope <dsq+i-1> (offset) * null2 - RETURN: null2 odds ratios per residue; <0..Kp-1>; caller allocated space * * Returns: <eslOK> on success; <null2> contains the null2 scores. The 0 * row of <pp> has been used as temp space, and happens to contain * the expected frequency that each M,I,N,C,J state is used in this * <pp> matrix to generate residues. * * Throws: (no abnormal error conditions) */ int p7_GNull2_ByExpectation(const P7_PROFILE *gm, P7_GMX *pp, float *null2) { int M = gm->M; int Ld = pp->L; float **dp = pp->dp; float *xmx = pp->xmx; float xfactor; int x; /* over symbols 0..K-1 */ int i; /* over offset envelope dsq positions 1..Ld */ int k; /* over model M states 1..M, I states 1..M-1 */ /* Calculate expected # of times that each emitting state was used * in generating the Ld residues in this domain. * The 0 row in <wrk> is used to hold these numbers. */ esl_vec_FCopy(pp->dp[1], (M+1)*p7G_NSCELLS, pp->dp[0]); esl_vec_FCopy(pp->xmx+p7G_NXCELLS, p7G_NXCELLS, pp->xmx); for (i = 2; i <= Ld; i++) { esl_vec_FAdd(pp->dp[0], pp->dp[i], (M+1)*p7G_NSCELLS); esl_vec_FAdd(pp->xmx, pp->xmx+i*p7G_NXCELLS, p7G_NXCELLS); } /* Convert those expected #'s to log frequencies; these we'll use as * the log posterior weights. */ esl_vec_FLog(pp->dp[0], (M+1)*p7G_NSCELLS); esl_vec_FLog(pp->xmx, p7G_NXCELLS); esl_vec_FIncrement(pp->dp[0], (M+1)*p7G_NSCELLS, -log((float)Ld)); esl_vec_FIncrement(pp->xmx, p7G_NXCELLS, -log((float)Ld)); /* Calculate null2's log odds emission probabilities, by taking * posterior weighted sum over all emission vectors used in paths * explaining the domain. * This is dog-slow; a point for future optimization. */ xfactor = XMX(0,p7G_N); xfactor = p7_FLogsum(xfactor, XMX(0,p7G_C)); xfactor = p7_FLogsum(xfactor, XMX(0,p7G_J)); esl_vec_FSet(null2, gm->abc->K, -eslINFINITY); for (x = 0; x < gm->abc->K; x++) { for (k = 1; k < M; k++) { null2[x] = p7_FLogsum(null2[x], MMX(0,k) + p7P_MSC(gm, k, x)); null2[x] = p7_FLogsum(null2[x], IMX(0,k) + p7P_ISC(gm, k, x)); } null2[x] = p7_FLogsum(null2[x], MMX(0,M) + p7P_MSC(gm, k, x)); null2[x] = p7_FLogsum(null2[x], xfactor); } esl_vec_FExp (null2, gm->abc->K); /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet, * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies * for this envelope. */ /* make valid scores for all degeneracies, by averaging the odds ratios. */ esl_abc_FAvgScVec(gm->abc, null2); /* does not set gap, nonres, missing */ null2[gm->abc->K] = 1.0; /* gap character */ null2[gm->abc->Kp-2] = 1.0; /* nonresidue "*" */ null2[gm->abc->Kp-1] = 1.0; /* missing data "~" */ return eslOK; }
/* Function: p7_GForward() * Synopsis: The Forward algorithm. * Incept: SRE, Mon Apr 16 13:57:35 2007 [Janelia] * * Purpose: The Forward dynamic programming algorithm. * * Given a digital sequence <dsq> of length <L>, a profile * <gm>, and DP matrix <gx> allocated for at least <gm->M> * by <L> cells; calculate the probability of the sequence * given the model using the Forward algorithm; return the * Forward matrix in <gx>, and the Forward score in <ret_sc>. * * The Forward score is in lod score form. To convert to a * bitscore, the caller needs to subtract a null model lod * score, then convert to bits. * * Args: dsq - sequence in digitized form, 1..L * L - length of dsq * gm - profile. * gx - DP matrix with room for an MxL alignment * opt_sc - optRETURN: Forward lod score in nats * * Return: <eslOK> on success. */ int p7_GForward(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc) { float const *tsc = gm->tsc; float **dp = gx->dp; float *xmx = gx->xmx; int M = gm->M; int i, k; float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY; /* Initialization of the zero row, and the lookup table of the log * sum routine. */ XMX(0,p7G_N) = 0; /* S->N, p=1 */ XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE]; /* S->N->B, no N-tail */ XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY; /* need seq to get here */ for (k = 0; k <= M; k++) MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY; /* need seq to get here */ p7_FLogsumInit(); /* Recursion. Done as a pull. * Note some slightly wasteful boundary conditions: * tsc[0] = impossible for all eight transitions (no node 0) * D_1 is wastefully calculated (doesn't exist) */ for (i = 1; i <= L; i++) { float const *rsc = gm->rsc[dsq[i]]; float sc; MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY; XMX(i, p7G_E) = -eslINFINITY; for (k = 1; k < M; k++) { /* match state */ sc = p7_FLogsum(p7_FLogsum(MMX(i-1,k-1) + TSC(p7P_MM,k-1), IMX(i-1,k-1) + TSC(p7P_IM,k-1)), p7_FLogsum(XMX(i-1,p7G_B) + TSC(p7P_BM,k-1), DMX(i-1,k-1) + TSC(p7P_DM,k-1))); MMX(i,k) = sc + MSC(k); /* insert state */ sc = p7_FLogsum(MMX(i-1,k) + TSC(p7P_MI,k), IMX(i-1,k) + TSC(p7P_II,k)); IMX(i,k) = sc + ISC(k); /* delete state */ DMX(i,k) = p7_FLogsum(MMX(i,k-1) + TSC(p7P_MD,k-1), DMX(i,k-1) + TSC(p7P_DD,k-1)); /* E state update */ XMX(i,p7G_E) = p7_FLogsum(p7_FLogsum(MMX(i,k) + esc, DMX(i,k) + esc), XMX(i,p7G_E)); } /* unrolled match state M_M */ sc = p7_FLogsum(p7_FLogsum(MMX(i-1,M-1) + TSC(p7P_MM,M-1), IMX(i-1,M-1) + TSC(p7P_IM,M-1)), p7_FLogsum(XMX(i-1,p7G_B) + TSC(p7P_BM,M-1), DMX(i-1,M-1) + TSC(p7P_DM,M-1))); MMX(i,M) = sc + MSC(M); IMX(i,M) = -eslINFINITY; /* unrolled delete state D_M */ DMX(i,M) = p7_FLogsum(MMX(i,M-1) + TSC(p7P_MD,M-1), DMX(i,M-1) + TSC(p7P_DD,M-1)); /* unrolled E state update */ XMX(i,p7G_E) = p7_FLogsum(p7_FLogsum(MMX(i,M), DMX(i,M)), XMX(i,p7G_E)); /* J state */ XMX(i,p7G_J) = p7_FLogsum(XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP], XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]); /* C state */ XMX(i,p7G_C) = p7_FLogsum(XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP], XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]); /* N state */ XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP]; /* B state */ XMX(i,p7G_B) = p7_FLogsum(XMX(i, p7G_N) + gm->xsc[p7P_N][p7P_MOVE], XMX(i, p7G_J) + gm->xsc[p7P_J][p7P_MOVE]); } if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + gm->xsc[p7P_C][p7P_MOVE]; gx->M = M; gx->L = L; return eslOK; }
/* Function: p7_GBackward() * Synopsis: The Backward algorithm. * Incept: SRE, Fri Dec 28 14:31:58 2007 [Janelia] * * Purpose: The Backward dynamic programming algorithm. * * Given a digital sequence <dsq> of length <L>, a profile * <gm>, and DP matrix <gx> allocated for at least <gm->M> * by <L> cells; calculate the probability of the sequence * given the model using the Backward algorithm; return the * Backward matrix in <gx>, and the Backward score in <ret_sc>. * * The Backward score is in lod score form. To convert to a * bitscore, the caller needs to subtract a null model lod * score, then convert to bits. * * Args: dsq - sequence in digitized form, 1..L * L - length of dsq * gm - profile * gx - DP matrix with room for an MxL alignment * opt_sc - optRETURN: Backward lod score in nats * * Return: <eslOK> on success. */ int p7_GBackward(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc) { float const *tsc = gm->tsc; float const *rsc = NULL; float **dp = gx->dp; float *xmx = gx->xmx; int M = gm->M; int i, k; float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY; /* Note: backward calculates the probability we can get *out* of * cell i,k; exclusive of emitting residue x_i. */ p7_FLogsumInit(); /* Initialize the L row. */ XMX(L,p7G_J) = XMX(L,p7G_B) = XMX(L,p7G_N) = -eslINFINITY; XMX(L,p7G_C) = gm->xsc[p7P_C][p7P_MOVE]; /* C<-T */ XMX(L,p7G_E) = XMX(L,p7G_C) + gm->xsc[p7P_E][p7P_MOVE]; /* E<-C, no tail */ MMX(L,M) = DMX(L,M) = XMX(L,p7G_E); /* {MD}_M <- E (prob 1.0) */ IMX(L,M) = -eslINFINITY; /* no I_M state */ for (k = M-1; k >= 1; k--) { MMX(L,k) = p7_FLogsum( XMX(L,p7G_E) + esc, DMX(L, k+1) + TSC(p7P_MD,k)); DMX(L,k) = p7_FLogsum( XMX(L,p7G_E) + esc, DMX(L, k+1) + TSC(p7P_DD,k)); IMX(L,k) = -eslINFINITY; } /* Main recursion */ for (i = L-1; i >= 1; i--) { rsc = gm->rsc[dsq[i+1]]; XMX(i,p7G_B) = MMX(i+1,1) + TSC(p7P_BM,0) + MSC(1); /* t_BM index is 0 because it's stored off-by-one. */ for (k = 2; k <= M; k++) XMX(i,p7G_B) = p7_FLogsum(XMX(i, p7G_B), MMX(i+1,k) + TSC(p7P_BM,k-1) + MSC(k)); XMX(i,p7G_J) = p7_FLogsum( XMX(i+1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP], XMX(i, p7G_B) + gm->xsc[p7P_J][p7P_MOVE]); XMX(i,p7G_C) = XMX(i+1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP]; XMX(i,p7G_E) = p7_FLogsum( XMX(i, p7G_J) + gm->xsc[p7P_E][p7P_LOOP], XMX(i, p7G_C) + gm->xsc[p7P_E][p7P_MOVE]); XMX(i,p7G_N) = p7_FLogsum( XMX(i+1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP], XMX(i, p7G_B) + gm->xsc[p7P_N][p7P_MOVE]); MMX(i,M) = DMX(i,M) = XMX(i,p7G_E); IMX(i,M) = -eslINFINITY; for (k = M-1; k >= 1; k--) { MMX(i,k) = p7_FLogsum( p7_FLogsum(MMX(i+1,k+1) + TSC(p7P_MM,k) + MSC(k+1), IMX(i+1,k) + TSC(p7P_MI,k) + ISC(k)), p7_FLogsum(XMX(i,p7G_E) + esc, DMX(i, k+1) + TSC(p7P_MD,k))); IMX(i,k) = p7_FLogsum( MMX(i+1,k+1) + TSC(p7P_IM,k) + MSC(k+1), IMX(i+1,k) + TSC(p7P_II,k) + ISC(k)); DMX(i,k) = p7_FLogsum( MMX(i+1,k+1) + TSC(p7P_DM,k) + MSC(k+1), p7_FLogsum( DMX(i, k+1) + TSC(p7P_DD,k), XMX(i, p7G_E) + esc)); } } /* At i=0, only N,B states are reachable. */ rsc = gm->rsc[dsq[1]]; XMX(0,p7G_B) = MMX(1,1) + TSC(p7P_BM,0) + MSC(1); /* t_BM index is 0 because it's stored off-by-one. */ for (k = 2; k <= M; k++) XMX(0,p7G_B) = p7_FLogsum(XMX(0, p7G_B), MMX(1,k) + TSC(p7P_BM,k-1) + MSC(k)); XMX(i,p7G_J) = -eslINFINITY; XMX(i,p7G_C) = -eslINFINITY; XMX(i,p7G_E) = -eslINFINITY; XMX(i,p7G_N) = p7_FLogsum( XMX(1, p7G_N) + gm->xsc[p7P_N][p7P_LOOP], XMX(0, p7G_B) + gm->xsc[p7P_N][p7P_MOVE]); for (k = M; k >= 1; k--) MMX(i,M) = IMX(i,M) = DMX(i,M) = -eslINFINITY; if (opt_sc != NULL) *opt_sc = XMX(0,p7G_N); gx->M = M; gx->L = L; return eslOK; }