/* ILogsum()
 *
 * Table-driven integer log-sum of two scores <s1> and <s2>.
 *
 * The larger score is first clamped from below at -INFTY. If the
 * smaller score is at or below -INFTY, or the two scores differ by
 * LOGSUM_TBL or more, the (clamped) larger score is returned
 * unchanged; otherwise the correction ilogsum_lookup[difference]
 * is added to it.
 */
int
ILogsum(int s1, int s2)
{
  const int lo = ESL_MIN(s1, s2);
  int       hi = ESL_MAX(s1, s2);

  hi = ESL_MAX(-INFTY, hi);

  if (lo <= -INFTY)            return hi;
  if ((hi - lo) >= LOGSUM_TBL) return hi;
  return hi + ilogsum_lookup[hi - lo];
}
/* Function:  esl_hmm_Forward()
 * Synopsis:  Scaled Forward algorithm for an <ESL_HMM>.
 *
 * Purpose:   Fill the matrix <fwd> for digital sequence <dsq>
 *            (residues 1..L) under model <hmm>.
 *
 *            Row 1 is <eo[dsq[1]][k] * pi[k]>; each later row <i> is
 *            <(\sum_m dp[i-1][m] * t[m][k]) * eo[dsq[i]][k]>. Every
 *            row is divided by its largest value, and the log of
 *            that value is stored in <fwd->sc[i]>. <fwd->sc[L+1]>
 *            holds the log of <\sum_m dp[L][m] * t[m][M]>. The
 *            total log score is the sum of <fwd->sc[1..L+1]>.
 *
 *            For <L == 0> the score is <log(hmm->pi[M])>, stored in
 *            <fwd->sc[1]>.
 *
 *            <fwd->M> and <fwd->L> are set on every successful
 *            return, including the <L == 0> case, so that a reused
 *            matrix never keeps the dimensions of a previous sequence.
 *
 * Args:      dsq    - digital sequence, indexed 1..L
 *            L      - length of <dsq>
 *            hmm    - the model
 *            fwd    - matrix to fill; caller provides the storage
 *            opt_sc - optRETURN: total log score
 *
 * Returns:   <eslOK>.
 *
 * Note:      NOTE(review): if every cell in a row is zero, the row
 *            maximum is 0.0 and the rescaling divides by zero. This
 *            is not guarded against here.
 */
int
esl_hmm_Forward(const ESL_DSQ *dsq, int L, const ESL_HMM *hmm, ESL_HMX *fwd, float *opt_sc)
{
  int   i, k, m;
  int   M     = hmm->M;
  float logsc = 0;
  float max;

  fwd->sc[0] = 0.0;

  if (L == 0)
    {
      fwd->sc[L+1] = logsc = log(hmm->pi[M]);
      fwd->M = hmm->M;		/* keep matrix dimensions in sync, as the general path does */
      fwd->L = L;
      if (opt_sc != NULL) *opt_sc = logsc;
      return eslOK;
    }

  /* Row 1: start probabilities times emission of the first residue. */
  max = 0.0;
  for (k = 0; k < M; k++)
    {
      fwd->dp[1][k] = hmm->eo[dsq[1]][k] * hmm->pi[k];
      max = ESL_MAX(fwd->dp[1][k], max);
    }
  for (k = 0; k < M; k++)
    fwd->dp[1][k] /= max;
  fwd->sc[1] = log(max);

  /* Rows 2..L: sum over predecessor states, emit, then rescale the row. */
  for (i = 2; i <= L; i++)
    {
      max = 0.0;
      for (k = 0; k < M; k++)
	{
	  fwd->dp[i][k] = 0.0;
	  for (m = 0; m < M; m++)
	    fwd->dp[i][k] += fwd->dp[i-1][m] * hmm->t[m][k];
	  fwd->dp[i][k] *= hmm->eo[dsq[i]][k];
	  max = ESL_MAX(fwd->dp[i][k], max);
	}
      for (k = 0; k < M; k++)
	fwd->dp[i][k] /= max;
      fwd->sc[i] = log(max);
    }

  /* Termination: transitions out of the last row into column M of <t>. */
  fwd->sc[L+1] = 0.0;
  for (m = 0; m < M; m++)
    fwd->sc[L+1] += fwd->dp[L][m] * hmm->t[m][M];
  fwd->sc[L+1] = log(fwd->sc[L+1]);

  /* The total score is the sum of the per-row log scale factors. */
  logsc = 0.0;
  for (i = 1; i <= L+1; i++)
    logsc += fwd->sc[i];

  fwd->M = hmm->M;
  fwd->L = L;
  if (opt_sc != NULL) *opt_sc = logsc;
  return eslOK;
}
/* Function:  esl_hmm_Backward()
 * Synopsis:  Scaled Backward algorithm for an <ESL_HMM>.
 *
 * Purpose:   Fill the matrix <bck> for digital sequence <dsq>
 *            (residues 1..L) under model <hmm>.
 *
 *            Row <L> is initialized to <t[k][M]>; each earlier row
 *            <i> is <\sum_m dp[i+1][m] * eo[dsq[i+1]][m] * t[k][m]>.
 *            Every row is divided by its largest value, and the log
 *            of that value is stored in <bck->sc[i]>. <bck->sc[0]>
 *            holds the log of <\sum_m dp[1][m] * eo[dsq[1]][m] * pi[m]>.
 *            The total log score is the sum of <bck->sc[0..L]>.
 *
 *            For <L == 0> the score is <log(hmm->pi[M])>, stored in
 *            <bck->sc[0]>.
 *
 *            <bck->M> and <bck->L> are set on every successful
 *            return, including the <L == 0> case, so that a reused
 *            matrix never keeps the dimensions of a previous sequence.
 *
 * Args:      dsq    - digital sequence, indexed 1..L
 *            L      - length of <dsq>
 *            hmm    - the model
 *            bck    - matrix to fill; caller provides the storage
 *            opt_sc - optRETURN: total log score
 *
 * Returns:   <eslOK>.
 *
 * Note:      NOTE(review): if every cell in a row is zero, the row
 *            maximum is 0.0 and the rescaling divides by zero. This
 *            is not guarded against here.
 */
int
esl_hmm_Backward(const ESL_DSQ *dsq, int L, const ESL_HMM *hmm, ESL_HMX *bck, float *opt_sc)
{
  int   i,k,m;
  int   M     = hmm->M;
  float logsc = 0.0;
  float max;

  bck->sc[L+1] = 0.0;

  if (L == 0)
    {
      bck->sc[0] = logsc = log(hmm->pi[M]);
      bck->M = hmm->M;		/* keep matrix dimensions in sync, as the general path does */
      bck->L = L;
      if (opt_sc != NULL) *opt_sc = logsc;
      return eslOK;
    }

  /* Row L: transitions into column M of <t>. */
  max = 0.0;
  for (k = 0; k < M; k++)
    {
      bck->dp[L][k] = hmm->t[k][M];
      max = ESL_MAX(bck->dp[L][k], max);
    }
  for (k = 0; k < M; k++)
    bck->dp[L][k] /= max;
  bck->sc[L] = log(max);

  /* Rows L-1..1: sum over successor states, then rescale the row. */
  for (i = L-1; i >= 1; i--)
    {
      max = 0.0;
      for (k = 0; k < M; k++)
	{
	  bck->dp[i][k] = 0.0;
	  for (m = 0; m < M; m++)
	    bck->dp[i][k] += bck->dp[i+1][m] * hmm->eo[dsq[i+1]][m] * hmm->t[k][m];
	  max = ESL_MAX(bck->dp[i][k], max);
	}
      for (k = 0; k < M; k++)
	bck->dp[i][k] /= max;
      bck->sc[i] = log(max);
    }

  /* Termination at the start of the sequence. */
  bck->sc[0] = 0.0;
  for (m = 0; m < M; m++)
    bck->sc[0] += bck->dp[1][m] * hmm->eo[dsq[1]][m] * hmm->pi[m];
  bck->sc[0] = log(bck->sc[0]);

  /* The total score is the sum of the per-row log scale factors. */
  logsc = 0.0;
  for (i = 0; i <= L; i++)
    logsc += bck->sc[i];

  bck->M = hmm->M;
  bck->L = L;
  if (opt_sc != NULL) *opt_sc = logsc;
  return eslOK;
}
/* Function:  esl_recorder_Read()
 * Synopsis:  Read next line of a stream through an <ESL_RECORDER>.
 * Incept:    SRE, Fri Dec 25 16:31:00 2009 [Casa de Gatos]
 *
 * Purpose:   Fetch the line at the recorder's current position and
 *            advance that position by one. A pointer to the line is
 *            passed back in <*opt_line> when <opt_line> is non-NULL.
 *
 *            The line's memory belongs to the recorder. A caller that
 *            wants to keep the text must copy it right away, because
 *            later <esl_recorder_*> calls may overwrite the buffers.
 *
 *            If the current position is already recorded, no I/O is
 *            done. Otherwise a new line is read from <rc->fp>. If
 *            the slot about to be written is the slot holding the
 *            marked line, the recorder is first grown by
 *            MAX(3, nalloc/3) lines.
 *
 * Returns:   <eslOK> on success.
 *            <eslEOF> if no more lines exist in the stream.
 *
 * Throws:    <eslEMEM> on an allocation failure.
 *            On any non-OK status, <*opt_line> is set to NULL.
 */
int
esl_recorder_Read(ESL_RECORDER *rc, char **opt_line)
{
  int slot = (rc->ncurr - rc->baseline) % rc->nalloc;	/* wrapped index of the line to return */
  int status;

  if (rc->ncurr >= rc->nread)	/* not recorded yet: must read from the stream */
    {
      if (rc->markline >= 0)
	{
	  int markslot = (rc->markline - rc->baseline) % rc->nalloc;

	  if (slot == markslot)	/* writing here would clobber the marked line */
	    {
	      int xtra = ESL_MAX(3, (rc->nalloc / 3));

	      status = esl_recorder_ResizeTo(rc, rc->nalloc + xtra);
	      if (status) goto ERROR;
	      slot = (rc->ncurr - rc->baseline) % rc->nalloc;	/* resize may have changed baseline/nalloc */
	    }
	}

      rc->offset[slot] = ftello(rc->fp);
      status = esl_fgets(&(rc->line[slot]), &(rc->lalloc[slot]), rc->fp);
      if (status) goto ERROR;
      rc->nread++;
    }

  rc->ncurr++;
  if (opt_line) *opt_line = rc->line[slot];
  return eslOK;

 ERROR:
  if (opt_line) *opt_line = NULL;
  return status;
}
/* Function:  p7_GHybrid()
 * Synopsis:  The "hybrid" algorithm.
 * Incept:    SRE, Sat May 19 10:01:46 2007 [Janelia]
 *
 * Purpose:   The profile HMM version of the Hwa "hybrid" alignment
 *            algorithm \citep{YuHwa02}. The "hybrid" score is the
 *            maximum match-state score in the Forward matrix.
 *
 *            Runs <p7_GForward()> on digital sequence <dsq> of length
 *            <L> with profile <gm> and DP matrix <gx>, then scans
 *            the match cells for rows 1..L and columns 1..gm->M to
 *            find their maximum. The Forward matrix is left in <gx>.
 *            The Forward score is optionally returned in
 *            <opt_fwdscore> and the Hybrid score in <opt_hybscore>.
 *
 *            The scores are returned in lod form. To convert to a
 *            bitscore, the caller needs to subtract a null model lod
 *            score, then convert to bits.
 *
 * Args:      dsq          - sequence in digitized form, 1..L
 *            L            - length of dsq
 *            gm           - profile
 *            gx           - DP matrix with room for an MxL alignment
 *            opt_fwdscore - optRETURN: Forward lod score in nats
 *            opt_hybscore - optRETURN: Hybrid lod score in nats
 *
 * Returns:   <eslOK> on success, with results in <gx>,
 *            <opt_fwdscore>, and <opt_hybscore>.
 *
 *            If <p7_GForward()> fails, its status is returned and
 *            both optional scores are set to 0.
 */
int
p7_GHybrid(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_fwdscore, float *opt_hybscore)
{
  float **dp    = gx->dp;	/* name is required by the MMX() macro */
  float   fwdsc = -eslINFINITY;
  float   hybsc = -eslINFINITY;
  int     row, col;
  int     status;

  status = p7_GForward(dsq, L, gm, gx, &fwdsc);
  if (status != eslOK)
    {
      if (opt_fwdscore != NULL) *opt_fwdscore = 0;
      if (opt_hybscore != NULL) *opt_hybscore = 0;
      return status;
    }

  for (row = 1; row <= L; row++)
    {
      for (col = 1; col <= gm->M; col++)
	hybsc = ESL_MAX(hybsc, MMX(row,col));
    }

  gx->M = gm->M;
  gx->L = L;
  if (opt_fwdscore != NULL) *opt_fwdscore = fwdsc;
  if (opt_hybscore != NULL) *opt_hybscore = hybsc;
  return eslOK;
}
/* p7_ViterbiCOPSw_run()
 *
 * Prepare a batch of SSE16_NVALS sequences from <seqsdb> and hand it
 * to p7_ViterbiCOPSw_partitioned_threaded().
 *
 * Steps, in order:
 *   1. find the longest sequence length in the batch;
 *   2. if it does not fit in the current buffers (with a margin of
 *      10), free and reallocate the buffers at a larger size;
 *   3. if the length changed, store it and reconfigure the profile;
 *   4. copy each sequence into dcops->seqs[], then pad the shorter
 *      ones up to the longest length with the value abc->Kp;
 *   5. zero the synch flags and bump the sync counter;
 *   6. run the threaded kernel, which writes into <results>.
 *
 * Returns the longest sequence length in the batch.
 */
int
p7_ViterbiCOPSw_run(DATA_COPS16* dcops, SEQ **seqsdb, float* results)
{
  int  s;			/* index over sequences in the batch */
  int  part;			/* index over partitions             */
  int  longest = 0;
  byte padsym;

  for (s = 0; s < SSE16_NVALS; s++)
    {
      if (seqsdb[s]->length > longest)
	longest = seqsdb[s]->length;
    }

  if (longest > dcops->allocL-10)
    {	/* buffers too small: reallocate */
      dcops->allocL = ESL_MAX(dcops->allocL*2, roundtop(longest,1024));
      free_cops_buffers(dcops);
      alloc_cops_buffers(dcops, dcops->allocL);
    }

  if (dcops->L != longest)
    {
      dcops->L = longest;
      p7_ReconfigLength(dcops->gm, longest);
    }

  /* Sequences are copied because N of them run in parallel, up to the
   * length of the longest one.
   */
  for (s = 0; s < SSE16_NVALS; s++)
    memcpy(dcops->seqs[s], seqsdb[s]->seq, (seqsdb[s]->length+1)*sizeof(ESL_DSQ));

  /* Pad the tail of each shorter sequence with the value Kp. */
  padsym = dcops->gm->abc->Kp;
  for (s = 0; s < SSE16_NVALS; s++)
    memset(dcops->seqs[s]+seqsdb[s]->length+1, padsym, longest-seqsdb[s]->length);

  for (part = 1; part <= dcops->gm->M/dcops->partition; part++)
    memset((void*) dcops->synchflags[part], 0, dcops->nflags);

  dcops->synccontrol++;

  p7_ViterbiCOPSw_partitioned_threaded(dcops, results, NTHREADS-1 );
  return longest;
}
/* process_dbfile()
 *
 * Open the source sequence database for negative subseqs.
 *
 * Opens <dbfile> (format <dbfmt>) in digital mode into <cfg->dbfp>,
 * scans it once with esl_sqio_ReadInfo() to set <cfg->db_nseq> and
 * <cfg->db_maxL> (the largest <sq->L> seen), then opens its SSI index
 * and checks that the index's primary key count equals the number of
 * sequences counted.
 *
 * Every failure is fatal via esl_fatal(), including failure to
 * allocate the temporary sequence object.
 *
 * Returns <eslOK>.
 */
static int
process_dbfile(struct cfg_s *cfg, char *dbfile, int dbfmt)
{
  ESL_SQ *sq = esl_sq_CreateDigital(cfg->abc);
  int     status;

  /* <sq> is passed to esl_sqio_ReadInfo() below; don't carry a NULL that far. */
  if (sq == NULL) esl_fatal("Failed to allocate digital sequence object");

  /* Open the sequence file in digital mode */
  status = esl_sqfile_OpenDigital(cfg->abc, dbfile, dbfmt, NULL, &(cfg->dbfp));
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", dbfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", dbfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  /* Read info on each sequence */
  cfg->db_nseq = 0;
  cfg->db_maxL = 0;
  while ((status = esl_sqio_ReadInfo(cfg->dbfp, sq)) == eslOK)
    {
      cfg->db_maxL = ESL_MAX(sq->L, cfg->db_maxL);
      cfg->db_nseq++;
      esl_sq_Reuse(sq);
    }
  if (status != eslEOF) esl_fatal("Something went wrong with reading the seq db");

  /* Open SSI index */
  if (esl_sqfile_OpenSSI(cfg->dbfp, NULL) != eslOK) esl_fatal("Failed to open SSI index file");
  if (cfg->dbfp->data.ascii.ssi->nprimary != cfg->db_nseq) esl_fatal("oops, nprimary != nseq");

  esl_sq_Destroy(sq);
  return eslOK;
}
/* Function:  esl_recorder_ResizeTo()
 * Synopsis:  Reallocate an <ESL_RECORDER> for a new <maxlines>
 * Incept:    SRE, Fri Dec 25 17:02:46 2009 [Casa de Gatos]
 *
 * Purpose:   Change the window size of recorder <rc> to
 *            <new_maxlines>, which may be larger or smaller than the
 *            current size. Asking for the current size is a no-op.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> if (re-)allocation fails.
 *
 *            <eslEINVAL> if shrinking would drop the marked line (if
 *            one is set) or the current line out of the window.
 *
 *            <eslEINCONCEIVABLE> on any baseline resetting problem;
 *            this would have to be an internal error in the module.
 *
 * Note:      The line array may need to be repermuted, with a new
 *            baseline.
 *
 *            Growing: if the array is circularly permuted it is
 *            straightened first. For example, to grow 3 1 2 to
 *            nalloc=6 we need 1 2 3 x x x. Plain reallocation to
 *            3 1 2 x x x would make the next read produce
 *            3 4 2 x x x.
 *
 *            Shrinking: if the array is in use beyond the new size,
 *            the baseline moves up to keep as much of the most recent
 *            history as possible. For 6->3:
 *               1 2 3 x x x  ->  1 2 3
 *               1 2 3 4 x x  ->  2 3 4   with new baseline=2
 *               4 5 0 1 2 3  ->  3 4 5   with new baseline=3
 *            Line buffers in the slots being dropped are freed.
 */
int
esl_recorder_ResizeTo(ESL_RECORDER *rc, int new_maxlines)
{
  void *tmp;
  int   needed;			/* minimum window size that keeps a required line */
  int   slot;
  int   status;

  if (new_maxlines == rc->nalloc) return eslOK;

  if (new_maxlines < rc->nalloc)
    {				/* shrinking */
      /* the marked line (if any) must stay in the window */
      if (rc->markline >= 0)
	{
	  needed = rc->nread - rc->markline;
	  if (new_maxlines < needed)
	    ESL_EXCEPTION(eslEINVAL, "can't shrink that far without losing marked line");
	}

      /* so must the current line */
      needed = rc->nread - rc->ncurr + 1;
      if (new_maxlines < needed)
	ESL_EXCEPTION(eslEINVAL, "can't shrink that far without losing current line");

      /* move the baseline up if the window in use is bigger than the new size */
      if (rc->nread - rc->baseline > new_maxlines)
	{
	  status = recorder_new_baseline(rc, rc->nread - new_maxlines);
	  if (status) ESL_EXCEPTION(eslEINCONCEIVABLE, "baseline reset failed unexpectedly");
	}

      /* release line buffers that fall off the end */
      for (slot = new_maxlines; slot < rc->nalloc; slot++)
	{
	  if (rc->line[slot]) free(rc->line[slot]);
	}
    }
  else if ((rc->nread - rc->baseline) / rc->nalloc != 0)
    {				/* growing, and the array has wrapped: straighten it out first */
      status = recorder_new_baseline(rc, ESL_MAX(rc->baseline, rc->nread - rc->nalloc));
      if (status) ESL_EXCEPTION(eslEINCONCEIVABLE, "baseline reset failed unexpectedly");
    }

  ESL_RALLOC(rc->line,   tmp, sizeof(char *) * new_maxlines);
  ESL_RALLOC(rc->lalloc, tmp, sizeof(int)    * new_maxlines);
  ESL_RALLOC(rc->offset, tmp, sizeof(off_t)  * new_maxlines);

  /* initialize any newly added slots; the loop does nothing when shrinking */
  for (slot = rc->nalloc; slot < new_maxlines; slot++)
    {
      rc->line[slot]   = NULL;
      rc->lalloc[slot] = 0;
      rc->offset[slot] = 0;
    }

  rc->nalloc = new_maxlines;
  return eslOK;

 ERROR:
  return status;
}
/* LogSum2()
 *
 * Table-driven log-sum of two float scores <s1> and <s2>.
 *
 * Returns the larger score unchanged when the smaller one is
 * -eslINFINITY or when they differ by 23 or more. Otherwise adds the
 * correction from flogsum_lookup[], indexed by the difference
 * scaled by INTSCALE and truncated to int.
 */
float
LogSum2(float s1, float s2)
{
  const float hi = ESL_MAX(s1, s2);
  const float lo = ESL_MIN(s1, s2);

  if (lo == -eslINFINITY || (hi-lo) >= 23.f)
    return hi;

  return hi + flogsum_lookup[(int)((hi-lo)*INTSCALE)];
}
/* Function:  p7_tophits_GetMaxPositionLength()
 * Synopsis:  Returns maximum position length in hit list (targets).
 *
 * Purpose:   Returns the length, in chars, of the longest hit
 *            location (start <ia> or end <ib>) printed in decimal,
 *            over all registered hits. This is useful when deciding
 *            how to format output.
 *
 *            The maximum is taken over all registered hits. This
 *            opens a possible side effect: caller might print only
 *            the top hits, and the max position length in those top
 *            hits may differ from the max over all the hits.
 *
 *            Used specifically for nhmmer output, so only the first
 *            domain (<dcl[0]>) of each hit is examined. Hits whose
 *            <ia> is not positive are skipped.
 *
 *            If there are no hits in <h>, or no hit has a positive
 *            <ia>, returns 0.
 *
 * Note:      The width is measured with <snprintf(NULL, 0, ...)>, so
 *            no fixed-size scratch buffer can overflow. The
 *            coordinates are cast to <long long> so the format is
 *            valid whether they are stored as <int> or as a wider
 *            integer type.
 */
int
p7_tophits_GetMaxPositionLength(P7_TOPHITS *h)
{
  int i, n;
  int max = 0;

  for (i = 0; i < h->N; i++)
    {
      if (h->unsrt[i].dcl[0].ia > 0)
	{
	  n   = snprintf(NULL, 0, "%lld", (long long) h->unsrt[i].dcl[0].ia);
	  max = ESL_MAX(n, max);
	  n   = snprintf(NULL, 0, "%lld", (long long) h->unsrt[i].dcl[0].ib);
	  max = ESL_MAX(n, max);
	}
    }
  return max;
}
/* tests: * 1. each sampled trace must validate. * 2. each trace must be <= viterbi trace score * 3. in a large # of traces, one is "equal" to the viterbi trace score. * (this of course is stochastic; but it's true for the particular * choice of RNG seed used in tests here.) */ static void utest_stotrace(ESL_GETOPTS *go, ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, P7_PROFILE *gm, P7_OPROFILE *om, ESL_DSQ *dsq, int L, int ntrace) { P7_GMX *gx = NULL; P7_OMX *ox = NULL; P7_TRACE *tr = NULL; char errbuf[eslERRBUFSIZE]; int idx; float maxsc = -eslINFINITY; float vsc, sc; if ((gx = p7_gmx_Create(gm->M, L)) == NULL) esl_fatal("generic DP matrix creation failed"); if ((ox = p7_omx_Create(gm->M, L, L)) == NULL) esl_fatal("optimized DP matrix create failed"); if ((tr = p7_trace_Create()) == NULL) esl_fatal("trace creation failed"); if (p7_GViterbi(dsq, L, gm, gx, &vsc) != eslOK) esl_fatal("viterbi failed"); if (p7_Forward (dsq, L, om, ox, NULL) != eslOK) esl_fatal("forward failed"); for (idx = 0; idx < ntrace; idx++) { if (p7_StochasticTrace(rng, dsq, L, om, ox, tr) != eslOK) esl_fatal("stochastic trace failed"); if (p7_trace_Validate(tr, abc, dsq, errbuf) != eslOK) esl_fatal("trace invalid:\n%s", errbuf); if (p7_trace_Score(tr, dsq, gm, &sc) != eslOK) esl_fatal("trace scoring failed"); maxsc = ESL_MAX(sc, maxsc); if (sc > vsc) esl_fatal("sampled trace has score > optimal Viterbi path; not possible"); p7_trace_Reuse(tr); } if (esl_FCompare(maxsc, vsc, 0.1) != eslOK) esl_fatal("stochastic trace failed to sample the Viterbi path"); p7_trace_Destroy(tr); p7_omx_Destroy(ox); p7_gmx_Destroy(gx); }
/* Function: p7_oprofile_GetSSVEmissionScoreArray() * Synopsis: Retrieve MSV residue emission scores from a * profile into an array * * Purpose: Extract an implicitly 2D array of 8-bit int MSV residue * emission scores from a profile <om>. <arr> must * be allocated by the calling function to be of size * ( om->abc->Kp * ( om->M + 1 )), and indexing into the array * is done as [om->abc->Kp * i + c ] for character c at * position i. * * In the dummy implementation, we need to convert from the * float emission probabilities to 8-bit int scores. Conversion * is based on code from the function mf_conversion in impl_sse's * p7_oprofile.c * * Args: <om> - profile, containing emission information * <arr> - preallocated array into which scores will be placed * * Returns: <eslOK> on success. * * Throws: (no abnormal error conditions) */ int p7_oprofile_GetSSVEmissionScoreArray(const P7_OPROFILE *om, uint8_t *arr ) { int M = om->M; /* length of the query */ int i, j; float x; float max = 0.0; float scale; uint8_t bias; /* scale and bias required for float->8bit conversion */ scale = 3.0 / eslCONST_LOG2; /* scores in units of third-bits */ for (i = 0; i < om->abc->K; i++) max = ESL_MAX(max, esl_vec_FMax(om->rsc[i], (M+1)*2)); max = -1.0f * roundf(scale * -1.0 * max); //based on unbiased_byteify bias = (max > 255.) ? 255 : (uint8_t) max; for (i = 1; i <= om->M; i++) { for (j=0; j<om->abc->Kp; j++) { //based on p7_oprofile's biased_byteify() x = -1.0f * roundf(scale * om->rsc[j][(i) * p7P_NR + p7P_MSC]); arr[i*om->abc->Kp + j] = (x > 255. - bias) ? 255 : (uint8_t) (x + bias); } } return eslOK; }
/* Function:  esl_recorder_MarkBlock()
 * Synopsis:  Mark first line to be saved in a block.
 * Incept:    SRE, Fri Jan  1 11:13:53 2010 [Magallon]
 *
 * Purpose:   Record line number <markline> (0..N-1) of the file being
 *            read through recorder <rc> as the first line of a block
 *            that will be parsed later, once the end of the block
 *            has been found.
 *
 *            While a mark is active, <esl_recorder_Read()> grows the
 *            recorder as needed instead of overwriting the marked
 *            line, so the whole block from the mark onward stays in
 *            memory.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if <markline> has already passed out of the
 *            recorder's memory, i.e. it lies before
 *            MAX(baseline, nread-nalloc).
 */
int
esl_recorder_MarkBlock(ESL_RECORDER *rc, int markline)
{
  int oldest = ESL_MAX(rc->baseline, rc->nread - rc->nalloc);	/* first line still in the window */

  if (markline < oldest)
    ESL_EXCEPTION(eslEINVAL, "recorder window already passed marked line");

  rc->markline = markline;
  return eslOK;
}
/* Function:  p7_tophits_GetMaxShownLength()
 * Synopsis:  Returns max shown name/accession length in hit list.
 *
 * Purpose:   Same as <p7_tophits_GetMaxNameLength()>, but for the
 *            case when --acc is on. For each hit, the accession is
 *            measured if it is present and non-empty; otherwise the
 *            name is measured, if there is one. Returns the largest
 *            such length, i.e. the max length of whatever is being
 *            shown as the reported "name".
 *
 *            Returns 0 if no hit has either string.
 */
int
p7_tophits_GetMaxShownLength(P7_TOPHITS *h)
{
  int i, n;
  int max = 0;

  for (i = 0; i < h->N; i++)
    {
      const char *shown = NULL;	/* the string that would be displayed for hit i */

      if      (h->unsrt[i].acc != NULL && h->unsrt[i].acc[0] != '\0') shown = h->unsrt[i].acc;
      else if (h->unsrt[i].name != NULL)                              shown = h->unsrt[i].name;

      if (shown != NULL)
	{
	  n   = strlen(shown);
	  max = ESL_MAX(n, max);
	}
    }
  return max;
}
/* yes LogSum2 and FLogsum are identical, this is for backwards compatibility */ float FLogsum(float s1, float s2) { const float max = ESL_MAX(s1, s2); const float min = ESL_MIN(s1, s2); #if 0 return (min == -eslINFINITY || (max-min) >= 23.f) ? max : max + sreLOG2(1.0 + sreEXP2(min-max)); /* EPN: While debugging. Replaces logsum table with analytical calculation. Remember to remove! */ #endif return (min == -eslINFINITY || (max-min) >= 23.f) ? max : max + flogsum_lookup[(int)((max-min)*INTSCALE)]; }
/* Function:  p7_tophits_GetMaxAccessionLength()
 * Synopsis:  Returns maximum accession length in hit list (targets).
 *
 * Purpose:   Same as <p7_tophits_GetMaxNameLength()>, but for
 *            accessions. Hits with a NULL accession are skipped.
 *            If there are no hits in <h>, or none of the hits have
 *            accessions, returns 0.
 */
int
p7_tophits_GetMaxAccessionLength(P7_TOPHITS *h)
{
  int longest = 0;
  int len;
  int i;

  for (i = 0; i < h->N; i++)
    {
      if (h->unsrt[i].acc == NULL) continue;

      len     = strlen(h->unsrt[i].acc);
      longest = ESL_MAX(len, longest);
    }
  return longest;
}
/* Function:  p7_GMSV()
 * Synopsis:  The MSV score algorithm (slow, correct version)
 * Incept:    SRE, Thu Dec 27 08:33:39 2007 [Janelia]
 *
 * Purpose:   Calculates the maximal score of ungapped local segment
 *            pair alignments, taking advantage of the fact that this
 *            is simply equivalent to setting all MM transitions to 1.0
 *            in a multihit local profile.
 *
 *            The special-state transition scores are computed here
 *            from <L>, <gm->M> and <nu>, not taken from <gm>:
 *               loop (NN,JJ,CC)  = log( L / (L+3) )
 *               move             = log( 3 / (L+3) )
 *               B->Mk            = log( 2 / (M*(M+1)) )
 *               E->J             = log( (nu-1) / nu )
 *               E->C             = log( 1 / nu )
 *
 * Args:      dsq          - sequence in digitized form, 1..L
 *            L            - length of dsq
 *            gm           - profile (can be in any mode)
 *            gx           - DP matrix with room for an MxL alignment
 *            nu           - configuration: expected number of hits (use 2.0 as a default)
 *            opt_sc       - optRETURN: MSV lod score in nats.
 *
 * Returns:   <eslOK> on success.
 *
 * Note:      This is written deliberately as a modified p7_GViterbi
 *            routine. It could be faster -- we don't need the
 *            interleaved dp matrix or residue scores, since we aren't
 *            calculating D or I states, for example, and we could do
 *            without some of the special states -- but speed is the
 *            job of the optimized implementations. Rather, the goal
 *            here is to establish a stable, probabilistically correct
 *            reference calculation. (Thus, the CC, NN, JJ transitions
 *            are real scores here, not fixed to 0 as in the optimized
 *            versions.)
 */
int
p7_GMSV(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float nu, float *opt_sc)
{
  /* <dp>, <xmx> and <rsc> keep these exact names: the MMX(), XMX()
   * and MSC() macros are used with them below.
   */
  float   **dp    = gx->dp;
  float    *xmx   = gx->xmx;
  const int M     = gm->M;
  float     tloop = logf((float) L / (float) (L+3));
  float     tmove = logf(     3.0f / (float) (L+3));
  float     tbmk  = logf(     2.0f / ((float) M * (float) (M+1)));
  float     tej   = logf((nu - 1.0f) / nu);
  float     tec   = logf(1.0f / nu);
  int       row, col;

  /* Row 0: nothing emitted yet. */
  XMX(0,p7G_N) = 0;
  XMX(0,p7G_B) = tmove;                                      /* S->N->B, no N-tail   */
  XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) =-eslINFINITY;  /* need seq to get here */
  for (col = 0; col <= M; col++)
    MMX(0,col) = -eslINFINITY;                               /* need seq to get here */

  for (row = 1; row <= L; row++)
    {
      float const *rsc = gm->rsc[dsq[row]];

      MMX(row,0)     = -eslINFINITY;
      XMX(row,p7G_E) = -eslINFINITY;

      /* Match states: extend the diagonal or start from B; E collects the row max. */
      for (col = 1; col <= M; col++)
	{
	  MMX(row,col)   = MSC(col) + ESL_MAX(MMX(row-1,col-1), XMX(row-1,p7G_B) + tbmk);
	  XMX(row,p7G_E) = ESL_MAX(XMX(row,p7G_E), MMX(row,col));
	}

      /* Special states. B depends on this row's N and J, so it comes last. */
      XMX(row,p7G_J) = ESL_MAX( XMX(row-1,p7G_J) + tloop,     XMX(row, p7G_E) + tej);
      XMX(row,p7G_C) = ESL_MAX( XMX(row-1,p7G_C) + tloop,     XMX(row, p7G_E) + tec);
      XMX(row,p7G_N) =          XMX(row-1,p7G_N) + tloop;
      XMX(row,p7G_B) = ESL_MAX( XMX(row,  p7G_N) + tmove,     XMX(row, p7G_J) + tmove);
    }

  gx->M = M;
  gx->L = L;
  if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + tmove;
  return eslOK;
}
/* p7_masstrace_GetMaxAbsDiff()
 *
 * Returns the largest absolute elementwise difference between two
 * mass traces <mte> and <mta>.
 *
 * The <kmass> arrays are always compared, over k = 1..mte->M. The
 * <imass> arrays are compared over i = 1..mte->L only when both
 * traces have one. The loop bounds come from <mte> alone.
 *
 * Returns 0 if nothing differs.
 */
float
p7_masstrace_GetMaxAbsDiff(const P7_MASSTRACE *mte, const P7_MASSTRACE *mta)
{
  float worst = 0.;
  float delta;
  int   idx;

  if (mte->imass && mta->imass)
    {
      for (idx = 1; idx <= mte->L; idx++)
	{
	  delta = fabs(mte->imass[idx] - mta->imass[idx]);
	  worst = ESL_MAX(delta, worst);
	}
    }

  for (idx = 1; idx <= mte->M; idx++)
    {
      delta = fabs(mte->kmass[idx] - mta->kmass[idx]);
      worst = ESL_MAX(delta, worst);
    }

  return worst;
}
/* is_multidomain_region()
 * SRE, Fri Feb  8 11:35:04 2008 [Janelia]
 *
 * This defines the trigger for when we need to hand a "region" off to
 * a deeper analysis (using stochastic tracebacks and clustering)
 * because there's reason to suspect it may encompass two or more
 * domains.
 *
 * The criterion is to find the split point z at which the expected
 * number of E occurrences preceding B occurrences is maximized, and
 * if that number is at least the heuristic threshold <ddef->rt3>,
 * then return TRUE. In other words, we're checking to see if there's
 * any point in the region at which it looks like an E was followed by
 * a B, as expected for a multidomain interpretation of the region.
 *
 * More precisely: return TRUE if  \max_z [ \min (B(z), E(z)) ]  >= rt3
 * where
 *   E(z) = expected number of E states occurring in region before z is emitted
 *        = \sum_{y=i}^{z} eocc[y]  =  etot[z] - etot[i-1]
 *   B(z) = expected number of B states occurring in region after z is emitted
 *        = \sum_{y=z}^{j} bocc[y]  =  btot[j] - btot[z-1]
 *
 * Because this relies on the <ddef->etot> and <ddef->btot> arrays,
 * <calculate_domain_posteriors()> needs to have been called first.
 *
 * Xref:    J2/101.
 */
static int
is_multidomain_region(P7_DOMAINDEF *ddef, int i, int j)
{
  float best = -1.0;		/* running max over split points     */
  float at_z;			/* min(E(z), B(z)) for the current z */
  int   z;

  for (z = i; z <= j; z++)
    {
      at_z = ESL_MIN( (ddef->etot[z] - ddef->etot[i-1]),  (ddef->btot[j] - ddef->btot[z-1]));
      best = ESL_MAX(best, at_z);
    }

  if (best >= ddef->rt3) return TRUE;
  return FALSE;
}
/* p7_ViterbiStream()
 *
 * Release the NTHREADS-1 worker semaphores, set their sync flags,
 * bump the sync counter, and run viterbi_stream_word_partitioned(),
 * which writes into <results>.
 *
 * The sequence preparation (finding the longest length, resizing
 * buffers, copying and padding sequences, clearing synch flags) is
 * currently compiled out with #if 0. As a result <seqsdb> is not
 * read and the returned length is always 0.
 */
int
p7_ViterbiStream(DATA_STREAM* dstream, dsq_cmp_t **seqsdb, float* results)
{
  int j;
  int maxL = 0;

#if 0
  for (j = 0; j < 8; j++)
    if (seqsdb[j]->length > maxL)
      maxL = seqsdb[j]->length;
  // maxL = 2000;
  // for (j = 0; j < 8; j++) printf("%d ", seqsdb[j]->length); printf("\n");

  if (maxL > dstream->allocL-10)
    {	// realloc
      dstream->allocL = ESL_MAX(dstream->allocL*2, roundtop(maxL,1024));
      free_stream_buffers(dstream);
      alloc_stream_buffers(dstream, dstream->allocL);
    }

  if (dstream->L != maxL)
    {
      dstream->L = maxL;
      p7_ReconfigLength(dstream->gm, maxL);
    }

  for (j = 0; j < 8; j++)
    memcpy(dstream->seqs[j], seqsdb[j]->seq, (seqsdb[j]->length+1)*sizeof(ESL_DSQ));

  // pad sequences with k = Alphasize
  for (j = 0; j < 8; j++)
    memset(dstream->seqs[j]+seqsdb[j]->length+1, dstream->gm->abc->Kp, maxL-seqsdb[j]->length);

  for (j = 1; j <= dstream->gm->M/dstream->partition; j++)
    memset((void*) dstream->synchflags[j], 0, dstream->nflags);

  for (j = 1; j <= dstream->gm->M/dstream->partition; j++)
    memset((void*) dstream->synchflags[j], 0, dstream->nflags);
#endif

  /* Wake each worker thread and flag it as synchronized. */
  j = 0;
  while (j < NTHREADS-1)
    {
      sem_post(&semsynch[j]);
      syncflags[j] = 1;
      j++;
    }

  dstream->synccontrol++;

  viterbi_stream_word_partitioned(dstream, results, NTHREADS-1 );
  return maxL;
}
/* Function:  esl_recorder_Position()
 * Synopsis:  Reset the recorder to a new starting line position.
 * Incept:    SRE, Mon Dec 28 10:25:22 2009 [Casa de Gatos]
 *
 * Purpose:   Move recorder <rc> so that the next call to
 *            <esl_recorder_Read()> returns line <linenumber>
 *            (numbered from 0).
 *
 *            <linenumber> may lie beyond the furthest line read so
 *            far. In that case <esl_recorder_Read()> is called
 *            repeatedly until enough lines have been read, which can
 *            end in <eslEOF> if the stream has no such line.
 *
 *            If <linenumber> falls before (outside) the recorder's
 *            history window, an <eslEINVAL> exception is thrown.
 *
 * Returns:   <eslOK> on success.
 *            <eslEOF> if <linenumber> is larger than current position
 *            in file, and the stream ends before line <linenumber> is
 *            reached.
 *
 * Throws:    <eslEMEM> on allocation failure; this can only happen
 *            if <linenumber> is larger than current position in
 *            file, forcing <esl_recorder_Read()> calls to reach that
 *            line.
 */
int
esl_recorder_Position(ESL_RECORDER *rc, int linenumber)
{
  /* The recorder holds lines MAX(baseline,<nread-nalloc>)..<nread>-1 */
  int oldest = ESL_MAX(rc->baseline, rc->nread - rc->nalloc);
  int status;

  if (linenumber < oldest)
    ESL_EXCEPTION(eslEINVAL, "recorder's window is past that line");

  /* Read forward until line <linenumber> is the next unread line.
   * Does nothing when the line is already in the window.
   */
  while (rc->nread < linenumber)
    {
      status = esl_recorder_Read(rc, NULL);
      if (status != eslOK) return status;
    }

  rc->ncurr = linenumber;
  return eslOK;
}
/* ILogsumNI()
 *
 * Integer log-sum for the case where neither score is -INFTY: the
 * caller guarantees s1 > -INFTY and s2 > -INFTY (checked by debug
 * assertions only), so the -INFTY test done by ILogsum() is skipped.
 *
 * Returns the larger score when the two differ by LOGSUM_TBL or more,
 * else the larger score plus ilogsum_lookup[difference].
 *
 * An explicit "if (s1 > s2) ... else ..." form of this was measured
 * by the original author to be about 10% slower.
 */
int
ILogsumNI(int s1, int s2)
{
  ESL_DASSERT1((s1 > -INFTY));
  ESL_DASSERT1((s2 > -INFTY));

  const int hi   = ESL_MAX(s1, s2);
  const int lo   = ESL_MIN(s1, s2);
  const int diff = hi - lo;

  if (diff >= LOGSUM_TBL) return hi;
  return hi + ilogsum_lookup[diff];
}
/* Function: DispatchSqBlockAlignment() * Date: EPN, Fri Dec 30 14:59:43 2011 * * Purpose: Given a CM and a block of sequences, align the * sequence(s) using the appropriate alignment function and * return relevant data for eventual output in <ret_dataA>. * This function simply calls DispatchSqAlignment() serially * for each sequence in the block, and creates an array * of the <ret_data> DispatchSqAlignment() returns. * * Currently <mode>, <cp9b_valid> and <pass_idx> values sent * to DispatchSqAlignment() are hard-coded to * TRMODE_UNKNOWN, FALSE, and PLI_PASS_5P_AND_3P_FORCE (if * cm->align_opts & CM_ALIGN_TRUNC) or PLI_PASS_STD_ANY (if * (! cm->align_opts & CM_ALIGN_TRUNC)). This is because * this function is only used by the alignment pipeline, in * which these values are correct. If this changes, we may * want caller to pass in an array of modes, cp9b_valids and * pass_idx values, one per sq. * * If (cm->flags & CM_ALIGN_XTAU) we'll potentially tighten * HMM bands until the required DP matrices are below out * limit (<mxsize>). cm->maxtau is the max allowed tau value * during this iterative band tightening, and cm->xtau is * the factor by which we multiply cm->tau at each iteration * during band tightening. * * Args: cm - the covariance model * errbuf - char buffer for reporting errors * sq_block - block of sequences to align * mxsize - max size in Mb of allowable DP mx * w - stopwatch for timing individual stages * w_tot - stopwatch for timing total time per seq * r - RNG, req'd if CM_ALIGN_SAMPLE, can be NULL otherwise * ret_dataA - RETURN: newly created array of CM_ALNDATA objects * * Returns: eslOK on success; * eslEINCOMPAT on contract violation, errbuf is filled; * eslEMEM if we run out of memory; * <ret_dataA> is alloc'ed and filled with sq_block->count CM_ALNDATA objects. 
*/ int DispatchSqBlockAlignment(CM_t *cm, char *errbuf, ESL_SQ_BLOCK *sq_block, float mxsize, ESL_STOPWATCH *w, ESL_STOPWATCH *w_tot, ESL_RANDOMNESS *r, CM_ALNDATA ***ret_dataA) { int status; /* easel status */ int j; /* counter over parsetrees */ CM_ALNDATA **dataA = NULL; /* CM_ALNDATA array we'll create and return */ ESL_SQ *sqp; /* ptr to a ESL_SQ */ int pass_idx; /* pass_idx passed to DispatchSqAlignment() */ char mode; /* mode passed to DispatchSqAlignment() */ int cp9b_valid; /* passed to DispatchSqAlignment() */ ESL_ALLOC(dataA, sizeof(CM_ALNDATA *) * ESL_MAX(1, sq_block->count)); // avoid 0 malloc for(j = 0; j < sq_block->count; j++) dataA[j] = NULL; /* DispatchSqAligment() needs a mode, pipeline pass index, and * knowledge of whether cm->cp9b are valid for sequence to align * (see note in 'Purpose' above). Currently the relevant values * for these are as follows: */ mode = TRMODE_UNKNOWN; pass_idx = (cm->align_opts & CM_ALIGN_TRUNC) ? PLI_PASS_5P_AND_3P_FORCE : PLI_PASS_STD_ANY; cp9b_valid = FALSE; /* main loop: for each sequence, call DispatchSqAlignment() to do the work */ for(j = 0; j < sq_block->count; j++) { sqp = sq_block->list + j; if((status = DispatchSqAlignment(cm, errbuf, sqp, sq_block->first_seqidx + j, mxsize, mode, pass_idx, cp9b_valid, w, w_tot, r, &(dataA[j]))) != eslOK) goto ERROR; } *ret_dataA = dataA; return eslOK; ERROR: if(dataA != NULL) { for(j = 0; j < sq_block->count; j++) { if(dataA[j] != NULL) cm_alndata_Destroy(dataA[j], FALSE); } free(dataA); } *ret_dataA = NULL; if(status == eslEMEM) ESL_FAIL(status, errbuf, "DispatchSqBlockAlignment(), out of memory"); else return status; /* errbuf was filled by DispatchSqAlignment() */ }
/* minimum_mpi_working_buffer()
 *
 * Compute, in <*ret_wn>, the smallest pack send/recv buffer size that
 * can hold either an error message or a result vector, plus the
 * shared status code. (The buffer may later grow beyond this, to hold
 * the largest HMM we send.)
 *
 *   error packet:   eslERRBUFSIZE chars
 *   results packet: N doubles (scores),
 *                   N ints (alignment lengths; only with "-a"),
 *                   2 doubles (mu, lambda)
 *   both:           1 int (status code)
 *
 * Returns <eslOK> on success, or <eslESYS> if any MPI_Pack_size()
 * call returns nonzero.
 */
static int
minimum_mpi_working_buffer(ESL_GETOPTS *go, int N, int *ret_wn)
{
  int sz;			/* size reported by one MPI_Pack_size() call */
  int err_sz    = 0;		/* size of an error packet                   */
  int result_sz = 0;		/* size of a results packet                  */

  /* error packet */
  if (MPI_Pack_size(eslERRBUFSIZE, MPI_CHAR, MPI_COMM_WORLD, &err_sz) != 0) return eslESYS;

  /* results packet: scores */
  if (MPI_Pack_size(N, MPI_DOUBLE, MPI_COMM_WORLD, &sz) != 0) return eslESYS;
  result_sz += sz;

  /* results packet: alignment lengths, only when requested */
  if (esl_opt_GetBoolean(go, "-a"))
    {
      if (MPI_Pack_size(N, MPI_INT, MPI_COMM_WORLD, &sz) != 0) return eslESYS;
      result_sz += sz;
    }

  /* results packet: mu, lambda */
  if (MPI_Pack_size(1, MPI_DOUBLE, MPI_COMM_WORLD, &sz) != 0) return eslESYS;
  result_sz += sz*2;

  /* the larger of the two packet kinds, plus the shared status code */
  *ret_wn = ESL_MAX(result_sz, err_sz);
  if (MPI_Pack_size(1, MPI_INT, MPI_COMM_WORLD, &sz) != 0) return eslESYS;
  *ret_wn += sz;

  return eslOK;
}
/* effective_seqnumber()
 *
 * <hmm> comes in with weighted observed counts. It goes out with
 * those observed counts rescaled to sum to the "effective sequence
 * number", which is stored in <hmm->eff_nseq>.
 *
 * The strategy is chosen by <bld->effn_strategy>:
 *   p7_EFFN_NONE    - use the number of sequences in <msa>;
 *   p7_EFFN_SET     - use the fixed value <bld->eset>;
 *   p7_EFFN_CLUST   - use the number of single-linkage clusters of
 *                     <msa> at identity threshold <bld->eid>;
 *   p7_EFFN_ENTROPY - entropy weighting toward a target that is the
 *                     larger of <bld->re_target> and a value derived
 *                     from <bld->esigma> and the model length
 *                     (xref J5/36).
 *
 * <msa> is needed because we may need to see the sequences in order
 * to determine effective seq #.
 *
 * <bg> and <bld->prior> are needed because entropy weighting may need
 * to parameterize test models looking for the right relative entropy.
 *
 * Returns <eslOK> on success. On failure, returns the failing status
 * with a message left in <bld->errbuf>.
 */
static int
effective_seqnumber(P7_BUILDER *bld, const ESL_MSA *msa, P7_HMM *hmm, const P7_BG *bg)
{
  int status;

  if (bld->effn_strategy == p7_EFFN_NONE)
    {
      hmm->eff_nseq = msa->nseq;
    }
  else if (bld->effn_strategy == p7_EFFN_SET)
    {
      hmm->eff_nseq = bld->eset;
    }
  else if (bld->effn_strategy == p7_EFFN_CLUST)
    {
      int nclust;

      status = esl_msacluster_SingleLinkage(msa, bld->eid, NULL, NULL, &nclust);
      if (status != eslOK)
	{
	  if (status == eslEMEM) ESL_XFAIL(status, bld->errbuf, "memory allocation failed");
	  ESL_XFAIL(status, bld->errbuf, "single linkage clustering algorithm (at %d%% id) failed", (int)(100 * bld->eid));
	}
      hmm->eff_nseq = (double) nclust;
    }
  else if (bld->effn_strategy == p7_EFFN_ENTROPY)
    {
      double etarget;
      double eff_nseq;

      etarget = (bld->esigma - eslCONST_LOG2R * log( 2.0 / ((double) hmm->M * (double) (hmm->M+1)))) / (double) hmm->M; /* xref J5/36. */
      etarget = ESL_MAX(bld->re_target, etarget);

      status = p7_EntropyWeight(hmm, bg, bld->prior, etarget, &eff_nseq);
      if (status != eslOK)
	{
	  if (status == eslEMEM) ESL_XFAIL(status, bld->errbuf, "memory allocation failed");
	  ESL_XFAIL(status, bld->errbuf, "internal failure in entropy weighting algorithm");
	}
      hmm->eff_nseq = eff_nseq;
    }

  /* Rescale the counts by the ratio of effective to observed sequence number. */
  p7_hmm_Scale(hmm, hmm->eff_nseq / (double) hmm->nseq);
  return eslOK;

 ERROR:
  return status;
}
/* utest_FLogsumError()
 *
 * Compares p7_FLogsum(a,b) against log(exp(a) + exp(b)) for "-N" random
 * pairs, each value drawn uniformly on -S..S (options read from <go>,
 * random numbers from <r>). The error of one trial is the absolute
 * difference divided by the "-S" range. With "-v", every trial and the
 * summary statistics are printed.
 *
 * Calls esl_fatal() if either the maximum or the average error exceeds
 * 0.0001.
 */
static void
utest_FLogsumError(ESL_GETOPTS *go, ESL_RANDOMNESS *r)
{
  int   ntrials = esl_opt_GetInteger(go, "-N");
  float range   = esl_opt_GetReal(go, "-S");
  int   verbose = esl_opt_GetBoolean(go, "-v");
  float worst   = 0.0;		/* largest error seen so far  */
  float mean    = 0.0;		/* running sum, then average  */
  float x, y;			/* the two sampled operands   */
  float truth, approx, relerr;
  int   trial   = 0;

  while (trial < ntrials)
    {
      x = (esl_random(r) - 0.5) * range * 2.; /* uniform draws on -range..range */
      y = (esl_random(r) - 0.5) * range * 2.;

      truth  = log(exp(x) + exp(y));
      approx = p7_FLogsum(x,y);

      relerr = fabs(truth-approx) / range;
      mean  += relerr;
      worst  = ESL_MAX(worst, relerr);

      if (verbose)
	printf("%8.4f %8.4f %8.4f %8.4f %8.4f\n", x, y, truth, approx, relerr);

      trial++;
    }
  mean /= (float) ntrials;

  if (verbose)
    {
      printf("average error = %f\n", mean);
      printf("max error = %f\n", worst);
    }

  if (worst > 0.0001) esl_fatal("maximum error of %f is too high: logsum unit test fails", worst);
  if (mean  > 0.0001) esl_fatal("average error of %f is too high: logsum unit test fails", mean);
}
/* Function:  p7_GViterbi()
 * Synopsis:  The Viterbi algorithm.
 * Incept:    SRE, Tue Jan 30 10:50:53 2007 [Einstein's, St. Louis]
 *
 * Purpose:   The standard Viterbi dynamic programming algorithm.
 *
 *            Given a digital sequence <dsq> of length <L>, a profile
 *            <gm>, and DP matrix <gx> allocated for at least <L>
 *            by <gm->M> cells; calculate the maximum scoring path by
 *            Viterbi; return the Viterbi score in <opt_sc>, and the
 *            Viterbi matrix is in <gx>.
 *
 *            The caller may then retrieve the Viterbi path by calling
 *            <p7_GTrace()>.
 *
 *            The Viterbi lod score is returned in nats. The caller
 *            needs to subtract a null model lod score, then convert
 *            to bits.
 *
 * Args:      dsq    - sequence in digitized form, 1..L
 *            L      - length of dsq
 *            gm     - profile.
 *            gx     - DP matrix with room for an MxL alignment;
 *                     <gx->M> and <gx->L> are set on return
 *            opt_sc - optRETURN: Viterbi lod score in nats; may be
 *                     NULL, in which case no score is written
 *
 * Return:    <eslOK> on success.
 *
 * Note:      The XMX/MMX/IMX/DMX and TSC/MSC/ISC macros are defined
 *            outside this function. NOTE(review): they presumably index
 *            through the locals <xmx>, <dp>, <tsc> and <rsc> by name,
 *            since those locals are not referenced directly below --
 *            confirm before renaming any of them.
 */
int
p7_GViterbi(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc)
{
  float const *tsc = gm->tsc;
  float **dp = gx->dp;
  float *xmx = gx->xmx;
  int M = gm->M;
  int i,k;
  float esc = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;	/* score added for Mk->E: 0 if the profile is local, else -infinity */

  /* Initialization of the zero row. */
  XMX(0,p7G_N) = 0; /* S->N, p=1 */
  XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE]; /* S->N->B, no N-tail */
  XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY; /* need seq to get here */
  for (k = 0; k <= gm->M; k++)
    MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY; /* need seq to get here */

  /* DP recursion */
  for (i = 1; i <= L; i++)
    {
      float const *rsc = gm->rsc[dsq[i]];	/* score row for the residue at position i */
      float sc;

      /* column 0 and the E cell start each row at -infinity */
      MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY;
      XMX(i,p7G_E) = -eslINFINITY;

      /* interior nodes 1..M-1; node M is handled separately after the loop */
      for (k = 1; k < gm->M; k++)
	{
	  /* match state */
	  sc = ESL_MAX( MMX(i-1,k-1) + TSC(p7P_MM,k-1), IMX(i-1,k-1) + TSC(p7P_IM,k-1));
	  sc = ESL_MAX(sc, DMX(i-1,k-1) + TSC(p7P_DM,k-1));
	  sc = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,k-1));
	  MMX(i,k) = sc + MSC(k);

	  /* E state update */
	  XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), MMX(i,k) + esc);
	  /* in Viterbi alignments, Dk->E can't win in local mode (and
	   * isn't possible in glocal mode), so don't bother
	   * looking. */

	  /* insert state */
	  sc = ESL_MAX(MMX(i-1,k) + TSC(p7P_MI,k), IMX(i-1,k) + TSC(p7P_II,k));
	  IMX(i,k) = sc + ISC(k);

	  /* delete state */
	  DMX(i,k) = ESL_MAX(MMX(i,k-1) + TSC(p7P_MD,k-1), DMX(i,k-1) + TSC(p7P_DD,k-1));
	}

      /* Unrolled match state M: same recursion as above, but no insert
       * state is computed for node M. */
      sc = ESL_MAX( MMX(i-1,M-1) + TSC(p7P_MM,M-1), IMX(i-1,M-1) + TSC(p7P_IM,M-1));
      sc = ESL_MAX(sc, DMX(i-1,M-1 ) + TSC(p7P_DM,M-1));
      sc = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,M-1));
      MMX(i,M) = sc + MSC(M);

      /* Unrolled delete state D_M
       * (Unlike internal Dk->E transitions that can never appear in
       * Viterbi alignments, D_M->E is possible in glocal mode.) */
      DMX(i,M) = ESL_MAX(MMX(i,M-1) + TSC(p7P_MD,M-1), DMX(i,M-1) + TSC(p7P_DD,M-1));

      /* E state update; transition from M_M scores 0 by def'n */
      sc = ESL_MAX(XMX(i,p7G_E), MMX(i,M));
      XMX(i,p7G_E) = ESL_MAX(sc, DMX(i,M));

      /* Now the special states. E must already be done, and B must follow N,J.
       * remember, N, C and J emissions are zero score by definition. */

      /* J state */
      sc = XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP]; /* J->J */
      XMX(i,p7G_J) = ESL_MAX(sc, XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_LOOP]); /* E->J is E's "loop" */

      /* C state */
      sc = XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP];
      XMX(i,p7G_C) = ESL_MAX(sc, XMX(i, p7G_E) + gm->xsc[p7P_E][p7P_MOVE]);

      /* N state */
      XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP];

      /* B state */
      sc = XMX(i,p7G_N) + gm->xsc[p7P_N][p7P_MOVE]; /* N->B is N's move */
      XMX(i,p7G_B) = ESL_MAX(sc, XMX(i,p7G_J) + gm->xsc[p7P_J][p7P_MOVE]); /* J->B is J's move */
    }

  /* T state (not stored): final score is C(L) plus the C->T move */
  if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + gm->xsc[p7P_C][p7P_MOVE];

  /* record the dimensions of the problem that was just filled in */
  gx->M = gm->M;
  gx->L = L;
  return eslOK;
}
/* main()
 *
 * Reads one sequence file (the single command line argument) in windows
 * and prints summary statistics: format, alphabet type, number of
 * sequences, total residues, smallest/largest/average length. With -a,
 * one line per sequence is printed as well; with -c, the residue
 * composition over all sequences is also tallied and printed.
 *
 * The alphabet is taken from --rna/--dna/--amino if given, otherwise it
 * is guessed from the file. The input format is taken from --informat if
 * given.
 *
 * Returns 0 on success. Most failures terminate through esl_fatal() or
 * cmdline_failure(); an allocation failure in ESL_ALLOC() releases what
 * was acquired and returns the failing status.
 */
int
main(int argc, char **argv)
{
  ESL_GETOPTS  *go        = NULL;
  char         *seqfile   = NULL;
  ESL_SQFILE   *sqfp      = NULL;
  int           infmt     = eslSQFILE_UNKNOWN;
  int           alphatype = eslUNKNOWN;
  ESL_ALPHABET *abc       = NULL;
  ESL_SQ       *sq        = NULL;
  int64_t       nseq      = 0;
  int64_t       nres      = 0;
  int64_t       small     = 0;
  int64_t       large     = 0;
  double       *monoc     = NULL; /* monoresidue composition per sequence  */
  double       *monoc_all = NULL; /* monoresidue composition over all seqs */
  int           do_comp   = FALSE;
  int           status    = eslOK;
  int           wstatus;
  int           i;
  int           x;

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go) != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n", go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") ) cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 1) cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  seqfile = esl_opt_GetArg(go, 1);
  do_comp = esl_opt_GetBoolean(go, "-c");

  if (esl_opt_GetString(go, "--informat") != NULL) {
    infmt = esl_sqio_FormatCode(esl_opt_GetString(go, "--informat"));
    /* The message has a %s conversion, so the offending format name must be passed. */
    if (infmt == eslSQFILE_UNKNOWN)
      esl_fatal("%s is not a valid input sequence file format for --informat", esl_opt_GetString(go, "--informat"));
  }

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT) esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status != eslOK) esl_fatal("Open failed, code %d.", status);

  /* An explicit alphabet option wins; otherwise guess from the file. */
  if (esl_opt_GetBoolean(go, "--rna")) alphatype = eslRNA;
  else if (esl_opt_GetBoolean(go, "--dna")) alphatype = eslDNA;
  else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
  else {
    status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
    if (status == eslEAMBIGUOUS) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile);
    else if (status == eslEFORMAT) esl_fatal("Sequence file parse error, line %ld of file %s:\n%s\n",
					     (long) sqfp->linenumber, /* cast to match %ld, as the read loop below does */
					     seqfile, sqfp->errbuf);
    else if (status == eslENODATA) esl_fatal("Sequence file %s contains no data?", seqfile);
    else if (status != eslOK) esl_fatal("Failed to guess alphabet (error code %d)\n", status);
  }
  abc = esl_alphabet_Create(alphatype);
  sq = esl_sq_CreateDigital(abc);
  esl_sqfile_SetDigital(sqfp, abc);

  if (do_comp) {
    ESL_ALLOC(monoc, (abc->Kp) * sizeof(double));
    ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double));
    esl_vec_DSet(monoc_all, abc->Kp, 0.0);
    esl_vec_DSet(monoc, abc->Kp, 0.0);
  }

  /* Read in windows of up to 4096 residues; eslEOD marks the end of one sequence. */
  while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF)
    {
      if (wstatus == eslOK)
	{
	  if (do_comp)
	    for (i = 1; i <= sq->n; i++)
	      monoc[sq->dsq[i]]++;
	}
      else if (wstatus == eslEOD)
	{
	  if (nseq == 0) { small = large = sq->L; }
	  else {
	    small = ESL_MIN(small, sq->L);
	    large = ESL_MAX(large, sq->L);
	  }

	  if (esl_opt_GetBoolean(go, "-a")) {
	    printf("= %-20s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : "");
	  }

	  nres += sq->L;
	  nseq++;
	  esl_sq_Reuse(sq);

	  /* fold this sequence's counts into the total and reset for the next one */
	  if (do_comp) {
	    esl_vec_DAdd(monoc_all, monoc, abc->Kp);
	    esl_vec_DSet(monoc, abc->Kp, 0.0);
	  }
	}
      else if (wstatus == eslEFORMAT)
	{
	  esl_fatal("Failed to parse sequence at line %ld, file %s:\n%s",
		    (long) sqfp->linenumber, sqfp->filename, sqfp->errbuf);
	}
      else
	esl_fatal("Failed in reading sequence:\n%s\n", sqfp->errbuf);
    }

  printf("Format: %s\n", esl_sqio_DescribeFormat(sqfp->format));
  printf("Alphabet type: %s\n", esl_abc_DescribeType(abc->type));
  printf("Number of sequences: %" PRId64 "\n", nseq);
  printf("Total # residues: %" PRId64 "\n", nres);
  printf("Smallest: %" PRId64 "\n", small);
  printf("Largest: %" PRId64 "\n", large);
  printf("Average length: %.1f\n", (float) nres / (float) nseq);

  if (do_comp) {
    printf("\nResidue composition:\n");
    /* canonical residues are always listed; other symbols only if they occurred */
    for (x = 0; x < abc->Kp; x++)
      if (x < abc->K || monoc_all[x] > 0)
	printf("residue: %c %10.0f %.4f\n", abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres);
    free(monoc);
    free(monoc_all);
  }

  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  return 0;

 ERROR:
  /* Reached only from a failed ESL_ALLOC(): release everything acquired so far. */
  free(monoc);
  free(monoc_all);
  if (abc  != NULL) esl_alphabet_Destroy(abc);
  if (sq   != NULL) esl_sq_Destroy(sq);
  if (sqfp != NULL) esl_sqfile_Close(sqfp);
  if (go   != NULL) esl_getopts_Destroy(go);
  return status;
}
/* Function:  p7_ViterbiFilter()
 * Synopsis:  Calculates Viterbi score, vewy vewy fast, in limited precision.
 * Incept:    SRE, Tue Nov 27 09:15:24 2007 [Janelia]
 *
 * Purpose:   Calculates an approximation of the Viterbi score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and a preallocated one-row DP matrix <ox>. Return the
 *            estimated Viterbi score (in nats) in <ret_sc>.
 *
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *
 *            The model must be in a local alignment mode; other modes
 *            cannot provide the necessary guarantee of no underflow.
 *
 *            This is a striped SIMD Viterbi implementation using
 *            VMX (AltiVec) integer intrinsics \citep{Farrar07}, in reduced
 *            precision (signed words, 16 bits).
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues
 *            om      - optimized profile
 *            ox      - DP matrix; <ox->M> is set to <om->M>
 *            ret_sc  - RETURN: Viterbi score (in nats)
 *
 * Returns:   <eslOK> on success;
 *            <eslERANGE> if the score overflows; in this case
 *            <*ret_sc> is <eslINFINITY>, and the sequence can
 *            be treated as a high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small, or if
 *            profile isn't in a local alignment mode. (Must be in local
 *            alignment mode because that's what helps us guarantee
 *            limited dynamic range.)
 *
 * Xref:      [Farrar07] for ideas behind striped SIMD DP.
 *            J2/46-47 for layout of HMMER's striped SIMD DP.
 *            J2/50 for single row DP.
 *            J2/60 for reduced precision (epu8)
 *            J2/65 for initial benchmarking
 *            J2/66 for precision maximization
 *            J4/138-140 for reimplementation in 16-bit precision
 */
int
p7_ViterbiFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  vector signed short mpv, dpv, ipv; /* previous row values */
  vector signed short sv; /* temp storage of 1 curr row value in progress */
  vector signed short dcv; /* delayed storage of D(i,q+1) */
  vector signed short xEv; /* E state: keeps max for Mk->E as we go */
  vector signed short xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector signed short Dmaxv; /* keeps track of maximum D cell on row */
  int16_t xE, xB, xC, xJ, xN; /* special states' scores */
  int16_t Dmax; /* maximum D cell score on row */
  int i; /* counter over sequence positions 1..L */
  int q; /* counter over vectors 0..nq-1 */
  int Q; /* segment length: # of vectors */
  vector signed short *dp; /* using {MDI}MX(q) macro requires initialization of <dp> */
  vector signed short *rsc; /* will point at om->ru[x] for residue x[i] */
  vector signed short *tsc; /* will point into (and step thru) om->tu */
  vector signed short negInfv; /* every element set to -32768, our "-infinity" */

  Q = p7O_NQW(om->M);
  dp = ox->dpw[0];

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ8) ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  if (om->mode != p7_LOCAL && om->mode != p7_UNILOCAL) ESL_EXCEPTION(eslEINVAL, "Fast filter only works for local alignment");
  ox->M = om->M;

  negInfv = esl_vmx_set_s16((signed short)-32768);

  /* Initialization. In signed 16-bit arithmetic, -infinity is -32768 */
  for (q = 0; q < Q; q++)
    MMXo(q) = IMXo(q) = DMXo(q) = negInfv;
  xN = om->base_w;
  xB = xN + om->xw[p7O_N][p7O_MOVE];
  xJ = -32768;
  xC = -32768;
  xE = -32768;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpVFRow(ox, 0, xE, 0, xJ, xB, xC); /* first 0 is <rowi>: do header. second 0 is xN: always 0 here. */
#endif

  for (i = 1; i <= L; i++)
    {
      rsc = om->rwv[dsq[i]];
      tsc = om->twv;
      dcv = negInfv; /* "-infinity" */
      xEv = negInfv;
      Dmaxv = negInfv;
      xBv = esl_vmx_set_s16(xB);

      /* Right shifts by 1 value (2 bytes). 4,8,12,x becomes x,4,8,12.
       * vec_sld(negInfv, v, 14) takes the last 16-bit element of negInfv
       * followed by all but the last element of v, so the vacated leading
       * element is filled with -32768 rather than zero.
       */
      mpv = MMXo(Q-1); mpv = vec_sld(negInfv, mpv, 14);
      dpv = DMXo(Q-1); dpv = vec_sld(negInfv, dpv, 14);
      ipv = IMXo(Q-1); ipv = vec_sld(negInfv, ipv, 14);

      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMXo(i,q); don't store it yet, hold it in sv.
	   * Each term consumes the next transition vector from <tsc>;
	   * vec_adds is a saturating add.
	   */
	  sv = vec_adds(xBv, *tsc); tsc++;
	  sv = vec_max (sv, vec_adds(mpv, *tsc)); tsc++;
	  sv = vec_max (sv, vec_adds(ipv, *tsc)); tsc++;
	  sv = vec_max (sv, vec_adds(dpv, *tsc)); tsc++;
	  sv = vec_adds(sv, *rsc); rsc++;
	  xEv = vec_max(xEv, sv);

	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMXo(q);
	  dpv = DMXo(q);
	  ipv = IMXo(q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMXo(q) = sv;
	  DMXo(q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
	   * delay storage, holding it in dcv
	   */
	  dcv = vec_adds(sv, *tsc); tsc++;
	  Dmaxv = vec_max(dcv, Dmaxv);

	  /* Calculate and store I(i,q) */
	  sv = vec_adds(mpv, *tsc); tsc++;
	  IMXo(q)= vec_max(sv, vec_adds(ipv, *tsc)); tsc++;
	}

      /* Now the "special" states, which start from Mk->E (->C, ->J->B) */
      xE = esl_vmx_hmax_s16(xEv);
      if (xE >= 32767) { *ret_sc = eslINFINITY; return eslERANGE; } /* immediately detect overflow */
      xN = xN + om->xw[p7O_N][p7O_LOOP];
      xC = ESL_MAX(xC + om->xw[p7O_C][p7O_LOOP], xE + om->xw[p7O_E][p7O_MOVE]);
      xJ = ESL_MAX(xJ + om->xw[p7O_J][p7O_LOOP], xE + om->xw[p7O_E][p7O_LOOP]);
      xB = ESL_MAX(xJ + om->xw[p7O_J][p7O_MOVE], xN + om->xw[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Finally the "lazy F" loop (sensu [Farrar07]). We can often
       * prove that we don't need to evaluate any D->D paths at all.
       *
       * The observation is that if we can show that on the next row,
       * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths
       * for all k, then we don't need any D->D calculations.
       *
       * The test condition is:
       *      max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i)
       * So:
       *   max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->ddbound_w;
       *   max_k D(i,k) is why we tracked Dmaxv;
       *   xB(i) was just calculated above.
       */
      Dmax = esl_vmx_hmax_s16(Dmaxv);
      if (Dmax + om->ddbound_w > xB)
	{
	  /* Now we're obligated to do at least one complete DD path to be sure. */
	  /* dcv has carried through from end of q loop above */
	  dcv = vec_sld(negInfv, dcv, 14);
	  tsc = om->twv + 7*Q; /* set tsc to start of the DD's */
	  for (q = 0; q < Q; q++)
	    {
	      DMXo(q) = vec_max(dcv, DMXo(q));
	      dcv = vec_adds(DMXo(q), *tsc); tsc++;
	    }

	  /* We may have to do up to three more passes; the check
	   * is for whether crossing a segment boundary can improve
	   * our score. The loop repeats while a pass runs to q == Q
	   * and stops at the first vector that dcv cannot improve.
	   * NOTE(review): "three" may date from a version with four
	   * elements per vector -- confirm the bound for 16-bit lanes.
	   */
	  do {
	    dcv = vec_sld(negInfv, dcv, 14);
	    tsc = om->twv + 7*Q; /* set tsc to start of the DD's */
	    for (q = 0; q < Q; q++)
	      {
		if (! vec_any_gt(dcv, DMXo(q))) break;
		DMXo(q) = vec_max(dcv, DMXo(q));
		dcv = vec_adds(DMXo(q), *tsc); tsc++;
	      }
	  } while (q == Q);
	}
      else /* not calculating DD? then just store the last M->D vector calc'ed.*/
	DMXo(0) = vec_sld(negInfv, dcv, 14);

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpVFRow(ox, i, xE, 0, xJ, xB, xC);
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T */
  if (xC > -32768)
    {
      /* undo the base offset, then convert from scaled integer units */
      *ret_sc = (float) xC + (float) om->xw[p7O_C][p7O_MOVE] - (float) om->base_w;
      /* *ret_sc += L * om->ncj_roundoff; see J4/150 for rationale: superceded by -3.0nat approximation*/
      *ret_sc /= om->scale_w;
      *ret_sc -= 3.0; /* the NN/CC/JJ=0,-3nat approximation: see J5/36. That's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ contrib */
    }
  else *ret_sc = -eslINFINITY;	/* C was never reached */
  return eslOK;
}
/* Function: p7_null3_score() * * Purpose: Calculate a correction (in log_2 odds) to be applied * to a sequence, using a null model based on the * composition of the target sequence. * The null model is constructed /post hoc/ as the * distribution of the target sequence; if the target * sequence is 40% A, 5% C, 5% G, 40% T, then the null * model is (0.4, 0.05, 0.05, 0.4). This function is * based heavily on Infernal's ScoreCorrectionNull3(), * with two important changes: * - it leaves the log2 conversion from NATS to BITS * for the calling function. * - it doesn't include the omega score modifier * (based on prior probability of using the null3 * model), again leaving this to the calling function. * * Args: abc - alphabet for hit (only used to get alphabet size) * dsq - the sequence the hit resides in * tr - trace of the alignment, used to find the match states * (non-match chars are ignored in computing freq, not used if NULL) * start - start position of hit in dsq * stop - end position of hit in dsq * bg - background, used for the default null model's emission freq * ret_sc - RETURN: the correction to the score (in NATS); * caller subtracts this from hit score to get * corrected score. * Return: void, ret_sc: the log-odds score correction (in NATS). */ void p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc) { float score = 0.; int status; int i; float *freq; int dir; int tr_pos; ESL_ALLOC(freq, sizeof(float) * abc->K); esl_vec_FSet(freq, abc->K, 0.0); /* contract check */ if(abc == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", ""); if(dsq == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", ""); if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", ""); dir = start < stop ? 
1 : -1; if (tr != NULL) { /* skip the parts of the trace that precede the first match state */ tr_pos = 2; i = start; while (tr->st[tr_pos] != p7T_M) { if (tr->st[tr_pos] == p7T_N) i += dir; tr_pos++; } /* tally frequencies from characters hitting match state*/ while (tr->st[tr_pos] != p7T_E) { if (tr->st[tr_pos] == p7T_M) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } if (tr->st[tr_pos] != p7T_D ) i += dir; tr_pos++; } } else { /* tally frequencies from the full envelope */ for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++) { if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", ""); esl_abc_FCount(abc, freq, dsq[i], 1.); } } esl_vec_FNorm(freq, abc->K); /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/ for (i = 0; i < abc->K; i++) score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ; /* Return the correction to the bit score. */ score = p7_FLogsum(0., score); *ret_sc = score; return; ERROR: esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", ""); return; /* never reached */ }