/* Function:  p7_anchors_SampleFromTrace()
 * Synopsis:  Build an anchor set by sampling one match state per domain.
 *
 * Purpose:   For each of the <tr->ndom> domains of trace <tr>, pick one
 *            of that domain's match states at random (using <rng>) and
 *            store its sequence/model coordinates as that domain's
 *            anchor in <anch>. <anch> is resized if needed, its
 *            sentinels are set from <tr->L> and <tr->M>, and <anch->D>
 *            is set to the number of domains.
 *
 *            <tr> must already be indexed by the caller with
 *            <p7_trace_Index()>: the per-domain trace bounds
 *            <tr->tfrom[]> and <tr->tto[]> are read here.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> on reallocation failure (a non-OK status from
 *            <p7_anchors_Resize()> is passed back unchanged).
 */
int
p7_anchors_SampleFromTrace(P7_ANCHORS *anch, ESL_RANDOMNESS *rng, const P7_TRACE *tr)
{
  const int ndom = tr->ndom;
  int       dom;                /* domain index in the trace's numbering, 0..ndom-1 */
  int       status;

  status = p7_anchors_Resize(anch, ndom);
  if (status != eslOK) return status;

  for (dom = 0; dom < ndom; dom++)
    {
      int n_match = 0;          /* number of match states in this domain    */
      int pick;                 /* countdown to the sampled match state     */
      int z;                    /* position in the trace                    */

      /* Pass 1: count the match states in this domain. */
      for (z = tr->tfrom[dom]; z <= tr->tto[dom]; z++)
        if (p7_trace_IsM(tr->st[z])) n_match++;
      ESL_DASSERT1(( n_match ));

      /* Pass 2: choose the pick'th match state, pick = 1..n_match. */
      pick = 1 + esl_rnd_Roll(rng, n_match);
      z    = tr->tfrom[dom];
      while (pick)
        {
          if (p7_trace_IsM(tr->st[z])) pick--;
          z++;                  /* always advances, so the chosen state ends up at z-1 */
        }
      ESL_DASSERT1(( p7_trace_IsM(tr->st[z-1]) ));

      /* Anchors are numbered 1..D, one higher than the trace's domain index. */
      anch->a[dom+1].i0 = tr->i[z-1];
      anch->a[dom+1].k0 = tr->k[z-1];
    }

  p7_anchor_SetSentinels(anch->a, ndom, tr->L, tr->M);
  anch->D = ndom;
  return eslOK;
}
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 *
 * Purpose:   Build a new alignment from <msa> that omits near-identical
 *            sequences. Sequences are visited in order; each one is
 *            compared (by pairwise identity *computed on the
 *            alignment*) against the sequences already kept, and is
 *            dropped as soon as one comparison exceeds <maxid>. So an
 *            earlier sequence is always kept in preference to a later
 *            one. The input <msa> is not changed.
 *
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *
 * Returns:   <eslOK> on success, and the new alignment in <*ret_newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int    *kept    = NULL;   /* kept[0..nkept-1]: indices of seqs retained so far  */
  int    *keep_it = NULL;   /* keep_it[0..nseq-1]: TRUE if seq goes in new msa     */
  int     nkept   = 0;      /* number of seqs retained so far                      */
  double  pid;              /* pairwise identity from the esl_dst_*PairId() call   */
  int     idx, k;
  int     is_redundant;
  int     status;

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  ESL_ALLOC(kept,    sizeof(int) * msa->nseq);
  ESL_ALLOC(keep_it, sizeof(int) * msa->nseq);
  esl_vec_ISet(keep_it, msa->nseq, 0);

  /* Greedy pass: keep a sequence unless it is too similar to one already kept. */
  for (idx = 0; idx < msa->nseq; idx++)
    {
      is_redundant = FALSE;
      for (k = 0; k < nkept && ! is_redundant; k++)
        {
          if (! (msa->flags & eslMSA_DIGITAL)) {
            status = esl_dst_CPairId(msa->aseq[idx], msa->aseq[kept[k]], &pid, NULL, NULL);
            if (status != eslOK) goto ERROR;
          }
#ifdef eslAUGMENT_ALPHABET
          else {
            status = esl_dst_XPairId(msa->abc, msa->ax[idx], msa->ax[kept[k]], &pid, NULL, NULL);
            if (status != eslOK) goto ERROR;
          }
#endif
          if (pid > maxid) is_redundant = TRUE;   /* ends the inner loop */
        }

      if (is_redundant) continue;
      kept[nkept++] = idx;
      keep_it[idx]  = TRUE;
    }

  status = esl_msa_SequenceSubset(msa, keep_it, ret_newmsa);
  if (status != eslOK) goto ERROR;

  free(kept);
  free(keep_it);
  return eslOK;

 ERROR:
  free(kept);
  free(keep_it);
  return status;
}
static double sxp_complete_binned_func(double *p, int np, void *dptr) { struct sxp_binned_data *data = (struct sxp_binned_data *) dptr; ESL_HISTOGRAM *g = data->g; double logL = 0.; double ai, bi; /* lower, upper bounds on bin */ double lambda, tau; int i; double tmp; lambda = exp(p[0]); tau = exp(p[1]); ESL_DASSERT1(( ! isnan(lambda) )); ESL_DASSERT1(( ! isnan(tau) )); for (i = g->cmin; i <= g->imax; i++) /* for each occupied bin */ { if (g->obs[i] == 0) continue; ai = esl_histogram_Bin2LBound(g, i); bi = esl_histogram_Bin2UBound(g, i); if (ai < data->mu) ai = data->mu; /* careful at leftmost bound */ tmp = esl_sxp_cdf(bi, data->mu, lambda, tau) - esl_sxp_cdf(ai, data->mu, lambda, tau); if (tmp == 0.) return eslINFINITY; logL += g->obs[i] * log(tmp); } return -logL; /* minimizing NLL */ }
/* mpas_stats_tally_domain()
 * Record in <stats> that one trace domain was hit by <nanch> anchors.
 */
static void
mpas_stats_tally_domain(P7_MPAS_STATS *stats, int nanch)
{
  if      (nanch == 0) { stats->dom_zero++; }
  else if (nanch == 1) { stats->anch_unique++;          stats->dom_one++;      }
  else if (nanch >  1) { stats->anch_multiple += nanch; stats->dom_multiple++; }
}

/* Function:  p7_mpas_stats_CompareAS2Trace()
 * Synopsis:  Tally how an anchor set lines up with the domains of a trace.
 *
 * Purpose:   Compare the anchors in <anch> against the domains in
 *            <tr> using sequence coordinates only (<a[].i0> versus
 *            <sqfrom[]>..<sqto[]>), and store six counts in <stats>:
 *
 *            For the domains in <tr>: each is hit by 0, 1, or 2+
 *            anchors (<dom_zero>, <dom_one>, <dom_multiple>).
 *            For the anchors in <anch>: each falls outside every
 *            domain, alone in a domain, or shares a domain with other
 *            anchors (<anch_outside>, <anch_unique>, <anch_multiple>).
 *
 *            Both lists are walked once, left to right; this presumably
 *            relies on anchors and domains being in increasing
 *            sequence order (confirm against callers).
 *
 *            Watch out: <ad> (in anchor set) is 1..D; <td> (in trace)
 *            is 0..ndom-1.
 *
 * Returns:   <eslOK>. The six counters are overwritten and
 *            <stats->has_part2> is set to TRUE.
 */
int
p7_mpas_stats_CompareAS2Trace(P7_MPAS_STATS *stats, const P7_ANCHORS *anch, const P7_TRACE *tr)
{
  int ad              = 1;   /* current anchor, 1..anch->D                    */
  int td              = 0;   /* current trace domain, 0..tr->ndom-1           */
  int anch_in_this_td = 0;   /* anchors seen so far inside domain <td>        */

  stats->anch_outside  = 0;
  stats->anch_unique   = 0;
  stats->anch_multiple = 0;
  stats->dom_zero      = 0;
  stats->dom_one       = 0;
  stats->dom_multiple  = 0;

  while (ad <= anch->D)
    {
      /* The <td == tr->ndom> test must come first: once all domains are
       * used up, tr->sqfrom[td] is past the last domain and must not be read.
       */
      if (td == tr->ndom || anch->a[ad].i0 < tr->sqfrom[td])
        {
          stats->anch_outside++;
          ad++;
        }
      else if (anch->a[ad].i0 <= tr->sqto[td])
        {
          anch_in_this_td++;
          ad++;
        }
      else
        { /* anchor lies beyond domain <td>: close that domain out, and
           * look at the same anchor again against the next domain.
           */
          mpas_stats_tally_domain(stats, anch_in_this_td);
          anch_in_this_td = 0;
          td++;
        }
    }

  /* We're out of anchors. If td == tr->ndom, every domain has already
   * been tallied. Otherwise the current domain still needs tallying,
   * and any domains after it were hit zero times.
   */
  for (; td < tr->ndom; td++)
    {
      mpas_stats_tally_domain(stats, anch_in_this_td);
      anch_in_this_td = 0;
    }

  ESL_DASSERT1(( stats->dom_zero     + stats->dom_one     + stats->dom_multiple  == tr->ndom ));
  ESL_DASSERT1(( stats->anch_outside + stats->anch_unique + stats->anch_multiple == anch->D  ));
  stats->has_part2 = TRUE;
  return eslOK;
}
/* Function:  p7_masstrace_CountTrace()
 * Synopsis:  Count domain endpoints into endpoint distributions.
 *
 * Purpose:   Look in traceback <tr> for a domain that passes through
 *            the anchor <i0>,<k0>,<st0>. If there is none, return
 *            without changing anything. If there is one, add that
 *            domain's start and end points to the counts in <mt>, and
 *            add one to <*ntr>.
 *
 *            This is useful in unit tests that approximate the mass
 *            trace calculation from a large ensemble of stochastic
 *            tracebacks.
 *
 *            Call <p7_masstrace_Zero()> on <mt> before the first
 *            <_CountTrace()> call, to initialize it.
 *
 *            <mt> holds a histogram while counting; once the whole
 *            ensemble has been counted, <mt> still has to be
 *            converted to a cumulative distribution.
 *
 *            The 'midpoints' of the <kmass> and <imass> vectors are a
 *            special case. <kmass[k]> will become the start point
 *            cumulative distribution P(ka <= k) for k<=k0, and the end
 *            point cumulative distribution P(kb >= k) for k>=k0. In a
 *            cumulative distribution it is fine for kmass[k0] to serve
 *            as both, because it is 1.0 either way. In a histogram we
 *            would have to distinguish a start ka at k0 from an end kb
 *            at k0. Rather than handle that, kmass[k0] (and imass[i0])
 *            are simply never counted; they get set to 1.0 when the
 *            counts are converted to a cumulative distribution.
 *
 *            Because counts accumulate in single-precision floating
 *            point, an ensemble of more than about $10^7$ traces
 *            cannot be counted accurately.
 *
 * Args:      tr   - trace structure
 *            i0   - i sequence position coord of domain anchor
 *            k0   - k model position coord of domain anchor
 *            st0  - a main model {MID}{LG} state type of domain anchor
 *            mt   - mass trace object to count endpoints in
 *            ntr  - updated number of traces that contain the anchor
 *
 * Returns:   <eslOK> on success. "Success" includes ignoring a trace
 *            that does not contain the anchor <i0>,<k0>,<st0>. If the
 *            trace does contain the anchor, start/endpoint counts in
 *            <mt> are incremented by one, and <*ntr> is incremented
 *            by one.
 */
int
p7_masstrace_CountTrace(const P7_TRACE *tr, int i0, int k0, int st0, P7_MASSTRACE *mt, int *ntr)
{
  int cur_i = 0;        /* most recent sequence position seen while scanning  */
  int zanch;            /* trace position of the anchor                       */
  int z;
  int ia, ib;           /* domain start, end in the sequence                  */
  int ka, kb;           /* domain start, end in the model                     */

  /* Contract checks on arguments */
  ESL_DASSERT1( ( i0>=1 && i0 <= tr->L) );
  ESL_DASSERT1( ( k0>=1 && k0 <= tr->M) );
  ESL_DASSERT1( ( p7_trace_IsMain(st0)) );
  ESL_DASSERT1( ( mt->i0  == 0 || mt->i0  == i0) );
  ESL_DASSERT1( ( mt->k0  == 0 || mt->k0  == k0) );
  ESL_DASSERT1( ( mt->st0 == 0 || mt->st0 == st0) );
  ESL_DASSERT1( ( mt->L == tr->L) );
  ESL_DASSERT1( ( mt->M == tr->M) );

  /* Scan for the anchor. A trace without it is ignored: successful return. */
  for (zanch = 0; zanch < tr->N; zanch++)
    {
      if (tr->i[zanch]) cur_i = tr->i[zanch];   /* only emitting states have tr->i[z] set */
      if (cur_i > i0)   return eslOK;           /* went past i0 without finding it        */
      if (cur_i == i0 && tr->st[zanch] == st0 && tr->k[zanch] == k0) break;
    }
  if (zanch == tr->N) return eslOK;

  /* Walk left from the anchor to the B state: last coords seen are the start. */
  ia = i0;
  ka = k0;
  z  = zanch;
  while (z >= 0 && tr->st[z] != p7T_B)
    {
      if (tr->i[z]) ia = tr->i[z];
      if (tr->k[z]) ka = tr->k[z];
      z--;
    }
  ESL_DASSERT1( ( tr->st[z] == p7T_B) );

  /* Walk right from the anchor to the E state: last coords seen are the end. */
  ib = i0;
  kb = k0;
  z  = zanch;
  while (z < tr->N && tr->st[z] != p7T_E)
    {
      if (tr->i[z]) ib = tr->i[z];
      if (tr->k[z]) kb = tr->k[z];
      z++;
    }
  ESL_DASSERT1( ( tr->st[z] == p7T_E) );

  /* Count the endpoints. Endpoints that coincide with the anchor itself
   * are deliberately not counted; <imass> is optional and may be NULL.
   */
  if (ka < k0) mt->kmass[ka] += 1.;
  if (kb > k0) mt->kmass[kb] += 1.;
  if (mt->imass)
    {
      if (ia < i0) mt->imass[ia] += 1.;
      if (ib > i0) mt->imass[ib] += 1.;
    }
  *ntr += 1;

  /* Make sure the anchor is recorded in <mt>. */
  mt->i0  = i0;
  mt->k0  = k0;
  mt->st0 = st0;
  return eslOK;
}
/* ILogsumNI()
 *
 * Table-driven integer log-sum of two scores, with no special handling
 * of -INFTY: the caller guarantees s1 >= -INFTY and s2 >= -INFTY.
 *
 * Returns the larger score when the two differ by LOGSUM_TBL or more;
 * otherwise the larger score plus the correction looked up in
 * <ilogsum_lookup[]> at the absolute difference.
 */
int
ILogsumNI(int s1, int s2)
{
  ESL_DASSERT1((s1 >= -INFTY));
  ESL_DASSERT1((s2 >= -INFTY));

  const int gap = s1 - s2;

  if (gap >=  LOGSUM_TBL) return s1;    /* s2 too small to matter */
  if (gap <= -LOGSUM_TBL) return s2;    /* s1 too small to matter */
  return (gap > 0) ? s1 + ilogsum_lookup[gap]
                   : s2 + ilogsum_lookup[-gap];
}
/* a2m_padding_digital()
 *
 * Expand each digital sequence in <msa> into a full aligned row.
 *
 * <ncons> is the number of consensus columns. <nins[0..ncons]> gives
 * the width of each insert region (before the first consensus column,
 * between consecutive ones, and after the last). <csflag[idx][pos]> is
 * FALSE for residues of sequence <idx> that are inserted; the first
 * non-FALSE flag ends each insert run.
 *
 * The new alignment length is <ncons> plus the sum of <nins[]>. A
 * reference line <msa->rf> is allocated and filled with '.' for insert
 * columns and 'x' for consensus columns. Each <msa->ax[idx]> is
 * replaced by a newly allocated row in which short insert runs are
 * padded on the right with gap symbols; the old row is freed.
 * <msa->alen> is set to the new length.
 *
 * Returns <eslOK> on success; on allocation failure, control passes to
 * the ERROR label and the status set by <ESL_ALLOC> is returned.
 */
static int
a2m_padding_digital(ESL_MSA *msa, char **csflag, int *nins, int ncons)
{
  ESL_DSQ *newax  = NULL;                        /* row under construction; swapped into msa->ax[] */
  ESL_DSQ  gap    = esl_abc_XGetGap(msa->abc);
  int      newlen = ncons;                       /* aligned length after padding                   */
  int      col;                                  /* columns written so far, 0..newlen              */
  int      c;                                    /* insert region / consensus column, 0..ncons     */
  int      s;                                    /* residues consumed from the input row           */
  int      k;                                    /* columns filled in the current insert region    */
  int      seqi;
  int      status;

  for (c = 0; c <= ncons; c++) newlen += nins[c];

  /* Reference annotation line */
  ESL_ALLOC(msa->rf, sizeof(char) * (newlen+1));
  col = 0;
  for (c = 0; c <= ncons; c++)
    {
      for (k = 0; k < nins[c]; k++) msa->rf[col++] = '.';
      if (c < ncons)                msa->rf[col++] = 'x';
    }
  msa->rf[col] = '\0';

  /* Rebuild each row. Digital rows are 1-based with sentinels at both ends. */
  for (seqi = 0; seqi < msa->nseq; seqi++)
    {
      ESL_ALLOC(newax, sizeof(ESL_DSQ) * (newlen + 2));
      newax[0] = eslDSQ_SENTINEL;

      col = 0;
      s   = 0;
      for (c = 0; c <= ncons; c++)
        {
          /* copy this sequence's inserted residues, then pad the region with gaps */
          for (k = 0; csflag[seqi][s] == FALSE; k++, col++, s++) newax[col+1] = msa->ax[seqi][s+1];
          for (     ; k < nins[c];              k++, col++)      newax[col+1] = gap;

          /* then the consensus column itself, except after the last insert region */
          if (c < ncons)
            {
              newax[col+1] = msa->ax[seqi][s+1];
              col++;
              s++;
            }
        }
      ESL_DASSERT1( (msa->ax[seqi][s+1] == eslDSQ_SENTINEL) );
      ESL_DASSERT1( (col == newlen) );
      newax[newlen+1] = eslDSQ_SENTINEL;

      free(msa->ax[seqi]);
      msa->ax[seqi] = newax;
      newax         = NULL;     /* ownership has passed to msa */
    }

  msa->alen = newlen;
  return eslOK;

 ERROR:
  free(newax);
  return status;
}
/* Function:  p7_masstrace_Zero()
 * Synopsis:  Initialize cumulative endpoint distributions to zeros.
 *
 * Purpose:   Prepare <mt> for collecting masstrace endpoint data for a
 *            sequence of length <L> and a profile of length <M>: set
 *            the first <M+2> entries of <mt->kmass> and, if the
 *            optional <mt->imass> is present, the first <L+2> entries
 *            of <mt->imass> to zero, and record <L> and <M> in <mt>.
 *
 *            <mt> must already be allocated large enough; this is
 *            checked only by debugging assertions.
 *
 * Args:      mt - mass trace object to collect endpoint data in
 *            M  - profile length
 *            L  - sequence length
 *
 * Returns:   <eslOK> on success.
 */
int
p7_masstrace_Zero(P7_MASSTRACE *mt, int M, int L)
{
  /* contract checks / argument validation */
  ESL_DASSERT1( (mt->imass == NULL || L+2 <= mt->ialloc ) );
  ESL_DASSERT1( (M+2 <= mt->kalloc) );

  mt->M = M;
  mt->L = L;

  esl_vec_FSet(mt->kmass, M+2, 0.0f);
  if (mt->imass != NULL)
    esl_vec_FSet(mt->imass, L+2, 0.0f);

  return eslOK;
}
/* numeric_derivative()
 *
 * Estimate the negative gradient of <func> at point <x[0..n-1]> by
 * central differences, and store it in <dx[0..n-1]>.
 *
 * For each coordinate i the step is |u[i] * relstep|. <x[i]> is moved
 * up and down by that step for the two evaluations of <func> and then
 * restored, so <x> is unchanged on return. <prm> is passed through to
 * <func> untouched.
 */
static void
numeric_derivative(double *x, double *u, int n,
                   double (*func)(double *, int, void*),
                   void *prm, double relstep,
                   double *dx)
{
  int i;

  for (i = 0; i < n; i++)
    {
      const double x0 = x[i];                    /* saved coordinate     */
      const double h  = fabs(u[i] * relstep);    /* step size            */
      double       f_plus, f_minus;

      x[i]    = x0 + h;
      f_plus  = func(x, n, prm);
      x[i]    = x0 - h;
      f_minus = func(x, n, prm);
      x[i]    = x0;

      dx[i] = (-0.5 * (f_plus - f_minus)) / h;   /* minus sign: negative gradient */
      ESL_DASSERT1((! isnan(dx[i])));
    }
}
/* select_e()
 *
 * Sample which state, M(k) or D(k), precedes the E state on row <i> of
 * DP matrix <ox>. Returns the chosen state type, <p7T_M> or <p7T_D>,
 * and stores the chosen model position (1-based) in <*ret_k>.
 * <om> is not referenced in the body.
 *
 * Using FChoose() here would mean allocating tmp space for 2M-1 paths;
 * instead we use the fact that E(i) is itself the necessary
 * normalization factor, and implement FChoose's algorithm here as an
 * on-the-fly calculation: multiply each M and D cell by 1/E(i),
 * accumulate a running sum, and stop at the first cell that carries
 * the running sum past a random number <roll>.
 *
 * Note that this means doing the summation in double precision, to be
 * sure 0.0 <= roll < 1.0.
 */
static inline int
select_e(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int *ret_k)
{
  int    Q    = p7O_NQF(ox->M);                          /* number of vectors in one striped row      */
  double sum  = 0.0;                                     /* running cumulative probability            */
  double roll = esl_random(rng);                         /* random threshold for the cumulative sum   */
  double norm = 1.0 / ox->xmx[i*p7X_NXCELLS+p7X_E];      /* 1/E(i): the normalization factor          */
  __m128 xEv  = _mm_set1_ps(norm);                       /* all M, D already scaled exactly the same  */
  union { __m128 v; float p[4]; } u;                     /* lets us read the 4 floats of a vector     */
  int    q,r;

  while (1) {
    for (q = 0; q < Q; q++)
      {
        /* match cells of vector q: element r maps to model position r*Q + q + 1 */
        u.v = _mm_mul_ps(ox->dpf[i][q*3 + p7X_M], xEv);
        for (r = 0; r < 4; r++) {
          sum += u.p[r];
          if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_M;}
        }

        /* delete cells of vector q, same position mapping */
        u.v = _mm_mul_ps(ox->dpf[i][q*3 + p7X_D], xEv);
        for (r = 0; r < 4; r++) {
          sum += u.p[r];
          if (roll < sum) { *ret_k = r*Q + q + 1; return p7T_D;}
        }
      }
    /* Getting here means a whole pass never carried <sum> past <roll>.
     * <sum> is not reset, so the next pass keeps adding on top of it.
     */
    ESL_DASSERT1((sum > 0.99));
  }
  /*UNREACHED*/
  ESL_EXCEPTION(-1, "unreached code was reached. universe collapses.");
}
/* Function: p7_coords2_hash_Create() * Synopsis: Create a <P7_COORDS2_HASH> * * Purpose: Allocate and initialize a <P7_COORDS2_HASH> hash table for storing * lots of coord2 arrays (i.e. domain annotations). * * The <init_*> arguments let you set non-default initial * allocation sizes. To use the default for any of these, * pass a 0 value. Defaults are 128 for the initial * hashtable size <init_hashsize>; 128 for the initial * allocation for number of keys to be stored <init_nkeyalloc>; * and 2048 for the initial allocation for the number * of integers to be stored in key data. * * In general the initialization defaults should be * fine. All three are grown automatically as needed, as * you add keys to the hash. * * "key data" means <n> <start>/<end> pairs, plus <n> * itself: it takes 2n+1 integers to store a <P7_COORD2> * array of length <n>. * * <hashsize> must be a power of 2; remember that if you * pass a non-default value. * * Args: init_hashsize : initial hashtable size. Power of 2; >0. * init_keyalloc : initial allocation for # keys. >0. * init_calloc : initial allocation for key data. >0. * * Returns: pointer to the new <P7_COORDS2_HASH> object on success. * * Throws: <NULL> on allocation failure. */ P7_COORDS2_HASH * p7_coords2_hash_Create(int32_t init_hashsize, int32_t init_nkeyalloc, int32_t init_calloc) { P7_COORDS2_HASH *ch = NULL; int32_t i; int status; ESL_DASSERT1(( init_hashsize == 0 || (init_hashsize && ((init_hashsize & (init_hashsize-1)) == 0)))); /* hashsize is a power of 2 (bitshifting trickery) */ ESL_ALLOC(ch, sizeof(P7_COORDS2_HASH)); ch->hashtable = NULL; ch->key_offset = NULL; ch->nxt = NULL; ch->cmem = NULL; ch->nkeys = 0; ch->cn = 0; ch->hashsize = (init_hashsize > 0 ? init_hashsize : 128); ch->kalloc = (init_nkeyalloc > 0 ? init_nkeyalloc : 128); ch->calloc = (init_calloc > 0 ? 
init_calloc : 2048); ESL_ALLOC(ch->hashtable, sizeof(int32_t) * ch->hashsize); for (i = 0; i < ch->hashsize; i++) ch->hashtable[i] = -1; ESL_ALLOC(ch->key_offset, sizeof(int32_t) * ch->kalloc); ESL_ALLOC(ch->nxt, sizeof(int32_t) * ch->kalloc); ESL_ALLOC(ch->cmem, sizeof(int32_t) * ch->calloc); return ch; ERROR: p7_coords2_hash_Destroy(ch); return NULL; }
/* Function:  p7_anchors_Resize()
 * Synopsis:  Reallocate a P7_ANCHORS object, if necessary
 *
 * Purpose:   Make sure that <anch> can hold an array of at least <D>
 *            anchors. The array needs <D+2> elements (presumably the
 *            two extra are for the sentinels at 0 and D+1).
 *
 *            Data already stored in <anch> are not altered, so it is
 *            safe to resize an anchor array that is being grown
 *            incrementally (as in the segmental divide and conquer
 *            MPAS algorithm).
 *
 *            D=0 is a valid argument and may occur in normal use; it
 *            results in a no-op, because the structure is always big
 *            enough to hold zero anchors.
 *
 *            Growth policy: if the needed size is under the redline
 *            <anch->nredline>, or if <anch->D> > 0 (which suggests the
 *            array is being built incrementally), grow by repeated
 *            doubling, to reduce the need for further reallocations
 *            soon. Otherwise -- over the redline and apparently
 *            starting from an empty object -- allocate exactly <D+2>.
 *            That size will probably not be a power of two; the
 *            original note here says the next _Reuse() call pulls the
 *            allocation back to the redline, which is.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    the status set by <ESL_REALLOC> on reallocation failure.
 *
 * Xref:      First example of a new pattern for how we can handle
 *            reallocation/reuse strategy, replacing _Reinit() and
 *            _Grow() interfaces. [SRE:J14/1]
 */
int
p7_anchors_Resize(P7_ANCHORS *anch, int D)
{
  int newsize;
  int status;

  /* Contract checks, argument validation */
  ESL_DASSERT1(( anch->nalloc > 0 ));

  if (D+2 <= anch->nalloc) return eslOK;          /* already big enough */

  if (D+2 >= anch->nredline && anch->D <= 0)
    {                                             /* over redline, empty object: exact fit */
      newsize = D+2;
    }
  else
    {                                             /* otherwise: grow by doubling */
      newsize = anch->nalloc;
      while (newsize < D+2) newsize *= 2;
    }

  ESL_REALLOC(anch->a, sizeof(P7_ANCHOR) * newsize);
  anch->nalloc = newsize;
  return eslOK;

 ERROR:
  return status;
}
/* Function:  p7_filtermx_GrowTo_avx()
 * Synopsis:  Resize filter DP matrix for new profile size (AVX2 build).
 *
 * Purpose:   Given an existing filter matrix structure <fx>, and the
 *            dimension <allocM> of a new profile that we're going to
 *            use (in consensus positions), assure that the AVX part of
 *            <fx> is large enough for such a profile; reallocate as
 *            needed. After a reallocation, <fx->dp_AVX> is recomputed
 *            as an aligned address inside the raw <fx->dp_mem_AVX>
 *            block, which is over-allocated by <p7_VALIGN_AVX-1> bytes
 *            to leave room for that.
 *
 *            <p7_filtermx_Reuse(fx); p7_filtermx_GrowTo(fx, M)>
 *            is essentially equivalent to <p7_filtermx_Create(M)>,
 *            while minimizing reallocation.
 *
 * Returns:   <eslOK> on success. <eslENORESULT> if the code was
 *            compiled without <HAVE_AVX2>.
 *
 * Throws:    <eslEMEM> on allocation failure. The state of
 *            <fx> is now undefined, and it should not be used.
 */
int
p7_filtermx_GrowTo_avx(P7_FILTERMX *fx, int allocM)
{
  int status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

#ifndef HAVE_AVX2
  return eslENORESULT;
#else
  /* nothing to do if the current allocation already suffices */
  if (fx->allocM_AVX >= allocM) return eslOK;

  /* otherwise grow the raw block, with slack for alignment */
  ESL_REALLOC(fx->dp_mem_AVX,
              (sizeof(__m256i) * (p7F_NSCELLS * P7_NVW_AVX(allocM))) + (p7_VALIGN_AVX-1));
  fx->allocM_AVX = allocM;

  /* round the raw address up to the alignment boundary */
  fx->dp_AVX = (__m256i *) ( (unsigned long int) ( (char *) fx->dp_mem_AVX + (p7_VALIGN_AVX-1))
                             & p7_VALIMASK_AVX);
  return eslOK;

 ERROR:
  return status;
#endif /* HAVE_AVX2 */
}
/* Function:  p7_filtermx_GrowTo_neon64()
 * Synopsis:  Resize filter DP matrix for new profile size (NEON64 build).
 *
 * Purpose:   Given an existing filter matrix structure <fx>, and the
 *            dimension <allocM> of a new profile that we're going to
 *            use (in consensus positions), assure that <fx> is large
 *            enough for such a profile; reallocate as needed. After a
 *            reallocation, <fx->dp> is recomputed as an aligned
 *            address inside the raw <fx->dp_mem> block, which is
 *            over-allocated by <p7_VALIGN-1> bytes to leave room for
 *            that.
 *
 *            <p7_filtermx_Reuse(fx); p7_filtermx_GrowTo(fx, M)>
 *            is essentially equivalent to <p7_filtermx_Create(M)>,
 *            while minimizing reallocation.
 *
 * Returns:   <eslOK> on success. <eslENORESULT> if the code was
 *            compiled without <HAVE_NEON64>.
 *
 * Throws:    <eslEMEM> on allocation failure. The state of
 *            <fx> is now undefined, and it should not be used.
 */
int
p7_filtermx_GrowTo_neon64(P7_FILTERMX *fx, int allocM)
{
#ifndef HAVE_NEON64
  return eslENORESULT;
#else
  int status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

  /* nothing to do if the current allocation already suffices */
  if (fx->allocM >= allocM) return eslOK;

  /* otherwise grow the raw block, with slack for alignment */
  ESL_REALLOC(fx->dp_mem,
              (sizeof(esl_neon_128i_t) * (p7F_NSCELLS * P7_NVW(allocM))) + (p7_VALIGN-1));
  fx->allocM = allocM;

  /* round the raw address up to the alignment boundary */
  fx->dp = (esl_neon_128i_t *) ( (unsigned long int) ( (char *) fx->dp_mem + (p7_VALIGN-1))
                                 & p7_VALIMASK);
  return eslOK;

 ERROR:
  return status;
#endif /* HAVE_NEON64 */
}
/* a2m_padding_text()
 *
 * Expand each text-mode sequence in <msa> into a full aligned row.
 *
 * <ncons> is the number of consensus columns. <nins[0..ncons]> gives
 * the width of each insert region (before the first consensus column,
 * between consecutive ones, and after the last). <csflag[idx][pos]> is
 * FALSE for residues of sequence <idx> that are inserted; the first
 * non-FALSE flag ends each insert run.
 *
 * The new alignment length is <ncons> plus the sum of <nins[]>. A
 * reference line <msa->rf> is allocated and filled with '.' for insert
 * columns and 'x' for consensus columns. Each <msa->aseq[idx]> is
 * replaced by a newly allocated string in which short insert runs are
 * padded on the right with '.'; the old string is freed. <msa->alen>
 * is set to the new length.
 *
 * Returns <eslOK> on success; on allocation failure, control passes to
 * the ERROR label and the status set by <ESL_ALLOC> is returned.
 */
static int
a2m_padding_text(ESL_MSA *msa, char **csflag, int *nins, int ncons)
{
  char *newseq = NULL;      /* row under construction; swapped into msa->aseq[] */
  int   newlen = ncons;     /* aligned length after padding                     */
  int   col;                /* next column to write, 0..newlen                  */
  int   c;                  /* insert region / consensus column, 0..ncons       */
  int   s;                  /* next character to read from the input row        */
  int   k;                  /* columns filled in the current insert region      */
  int   seqi;
  int   status;

  for (c = 0; c <= ncons; c++) newlen += nins[c];

  /* Reference annotation line */
  ESL_ALLOC(msa->rf, sizeof(char) * (newlen+1));
  col = 0;
  for (c = 0; c <= ncons; c++)
    {
      for (k = 0; k < nins[c]; k++) msa->rf[col++] = '.';
      if (c < ncons)                msa->rf[col++] = 'x';
    }
  msa->rf[col] = '\0';

  /* Rebuild each row */
  for (seqi = 0; seqi < msa->nseq; seqi++)
    {
      ESL_ALLOC(newseq, sizeof(char) * (newlen + 1));

      col = 0;
      s   = 0;
      for (c = 0; c <= ncons; c++)
        {
          /* copy this sequence's inserted residues, then pad the region with '.' */
          for (k = 0; csflag[seqi][s] == FALSE; k++, col++, s++) newseq[col] = msa->aseq[seqi][s];
          for (     ; k < nins[c];              k++, col++)      newseq[col] = '.';

          /* then the consensus column itself, except after the last insert region */
          if (c < ncons)
            {
              newseq[col] = msa->aseq[seqi][s];
              col++;
              s++;
            }
        }
      ESL_DASSERT1( (msa->aseq[seqi][s] == '\0') );
      ESL_DASSERT1( (col == newlen) );
      newseq[newlen] = '\0';

      free(msa->aseq[seqi]);
      msa->aseq[seqi] = newseq;
      newseq          = NULL;   /* ownership has passed to msa */
    }

  msa->alen = newlen;
  return eslOK;

 ERROR:
  free(newseq);
  return status;
}
/* Function: p7_filtermx_DumpMFRow() * Synopsis: Dump one row from MSV version of a DP matrix. * * Purpose: Dump current row of MSV calculations from DP matrix <fx> * for diagnostics, and include the values of specials * <xE>, etc. The index <rowi> for the current row is used * as a row label. This routine has to be specialized for * the layout of the MSVFilter() row, because it's all * match scores dp[0..q..Q-1], rather than triplets of * M,D,I. * * If <rowi> is 0, print a header first too. * * The output format is coordinated with <p7_refmx_Dump()> to * facilitate comparison to a known answer. * * This also works for an SSV filter row, for SSV implementations * that use a single row of DP memory (like <_longtarget>). * The Knudsen assembly code SSV does not use any RAM. * * Returns: <eslOK> on success. * * Throws: <eslEMEM> on allocation failure. */ int p7_filtermx_DumpMFRow_neon64(const P7_FILTERMX *fx, int rowi, uint8_t xE, uint8_t xN, uint8_t xJ, uint8_t xB, uint8_t xC) { #ifdef HAVE_NEON64 int Q = P7_NVB(fx->M); /* number of vectors in the MSV row */ uint8_t *v = NULL; /* array of scores after unstriping them */ int q,z,k; union { esl_neon_128i_t v; uint8_t i[16]; } tmp; int status; ESL_DASSERT1( (fx->type == p7F_MSVFILTER || fx->type == p7F_SSVFILTER) ); /* We'll unstripe the whole row; then print it in its normal order. */ ESL_ALLOC(v, sizeof(unsigned char) * ((Q*16)+1)); v[0] = 0; /* Header (if we're on the 0th row) */ if (rowi == 0) { fprintf(fx->dfp, " "); for (k = 0; k <= fx->M; k++) fprintf(fx->dfp, "%3d ", k); fprintf(fx->dfp, "%3s %3s %3s %3s %3s\n", "E", "N", "J", "B", "C"); fprintf(fx->dfp, " "); for (k = 0; k <= fx->M+5; k++) fprintf(fx->dfp, "%3s ", "---"); fprintf(fx->dfp, "\n"); } /* Unpack and unstripe, then print M's. 
*/ for (q = 0; q < Q; q++) { tmp.v = fx->dp[q]; for (z = 0; z < 16; z++) v[q+Q*z+1] = tmp.i[z]; } fprintf(fx->dfp, "%4d M ", rowi); for (k = 0; k <= fx->M; k++) fprintf(fx->dfp, "%3d ", v[k]); /* The specials */ fprintf(fx->dfp, "%3d %3d %3d %3d %3d\n", xE, xN, xJ, xB, xC); /* I's are all 0's; print just to facilitate comparison to refmx. */ fprintf(fx->dfp, "%4d I ", rowi); for (k = 0; k <= fx->M; k++) fprintf(fx->dfp, "%3d ", 0); fprintf(fx->dfp, "\n"); /* D's are all 0's too */ fprintf(fx->dfp, "%4d D ", rowi); for (k = 0; k <= fx->M; k++) fprintf(fx->dfp, "%3d ", 0); fprintf(fx->dfp, "\n\n"); free(v); return eslOK; ERROR: free(v); return status; #endif //HAVE_NEON64 #ifndef HAVE_NEON64 return eslENORESULT; #endif }
/* ILogsumNI()
 *
 * Table-driven integer log-sum of two scores, with no special handling
 * of -INFTY. The original header states the guarantee as
 * s1 >= -INFTY, s2 >= -INFTY; the debugging assertions below are
 * stricter and require both to be strictly greater than -INFTY.
 *
 * Returns the larger score when the two differ by LOGSUM_TBL or more;
 * otherwise the larger score plus the correction looked up in
 * <ilogsum_lookup[]> at their difference.
 *
 * Implementation note carried over from the original: a variant that
 * first branches on (s1 > s2) and then does the table test in each arm
 * was measured to be about 10% slower than this max/min form.
 */
int
ILogsumNI(int s1, int s2)
{
  ESL_DASSERT1((s1 > -INFTY));
  ESL_DASSERT1((s2 > -INFTY));

  const int hi  = ESL_MAX(s1, s2);
  const int lo  = ESL_MIN(s1, s2);
  const int gap = hi - lo;             /* always >= 0 */

  if (gap >= LOGSUM_TBL) return hi;    /* smaller term too small to matter */
  return hi + ilogsum_lookup[gap];
}
/* ILogsumNI_diff()
 *
 * Table-driven integer log-sum of two scores that are each given as a
 * sum of two parts: term 1 is (s1a + s1b), term 2 is (s2a + s2b). The
 * caller also passes <db> = s1b - s2b, so the difference between the
 * two terms is obtained as d = s1a - s2a + db.
 *
 * No special handling of -INFTY: all four parts must be strictly
 * greater than -INFTY (checked by debugging assertions only).
 *
 * Returns the larger term when the two differ by LOGSUM_TBL or more;
 * otherwise the larger term plus the correction looked up in
 * <ilogsum_lookup[]> at |d|.
 *
 * The cutoff in the d <= 0 branch is -LOGSUM_TBL. Comparing against
 * +LOGSUM_TBL there would always be true, so the correction would
 * never be added when term 2 is the larger (or equal) one.
 */
int
ILogsumNI_diff(int s1a, int s1b, int s2a, int s2b, int db)
{
  ESL_DASSERT1((s1a > -INFTY));
  ESL_DASSERT1((s1b > -INFTY));
  ESL_DASSERT1((s2a > -INFTY));
  ESL_DASSERT1((s2b > -INFTY));

  const int d = s1a - s2a + db;     /* term 1 minus term 2 */

  if (d > 0) return (d >=  LOGSUM_TBL) ? s1a + s1b : s1a + s1b + ilogsum_lookup[d];
  else       return (d <= -LOGSUM_TBL) ? s2a + s2b : s2a + s2b + ilogsum_lookup[-d];
}
/* Function: esl_msaweight_PB() * Synopsis: PB (position-based) weights. * Incept: SRE, Sun Nov 5 08:59:28 2006 [Janelia] * * Purpose: Given a multiple alignment <msa>, calculate sequence * weights according to the position-based weighting * algorithm (Henikoff and Henikoff, JMB 243:574-578, * 1994). These weights are stored internally in the <msa> * object, replacing any weights that may have already been * there. Weights are $\geq 0$ and they sum to <msa->nseq>. * * The <msa> may be in either digitized or text mode. * Digital mode is preferred, so that the algorithm * deals with degenerate residue symbols properly. * * The Henikoffs' algorithm does not give rules for dealing * with gaps or degenerate residue symbols. The rule here * is to ignore them. This means that longer sequences * initially get more weight; hence a "double * normalization" in which the weights are first divided by * sequence length in canonical residues (to compensate for * that effect), then normalized to sum to nseq. * * An advantage of the PB method is efficiency. * It is $O(1)$ in memory and $O(NL)$ time, for an alignment of * N sequences and L columns. This makes it a good method * for ad hoc weighting of very deep alignments. * * When the alignment is in simple text mode, IUPAC * degenerate symbols are not dealt with correctly; instead, * the algorithm simply uses the 26 letters as "residues" * (case-insensitively), and treats all other residues as * gaps. * * Returns: <eslOK> on success, and the weights inside <msa> have been * modified. * * Throws: <eslEMEM> on allocation error, in which case <msa> is * returned unmodified. * * Xref: [Henikoff94b]; squid::weight.c::PositionBasedWeights(). 
*/ int esl_msaweight_PB(ESL_MSA *msa) { int *nres = NULL; /* counts of each residue observed in a column */ int ntotal; /* number of different symbols observed in a column */ int rlen; /* number of residues in a sequence */ int idx, pos, i; int K; /* alphabet size */ int status; /* Contract checks */ ESL_DASSERT1( (msa->nseq >= 1) ); ESL_DASSERT1( (msa->alen >= 1) ); if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; } /* Initialize */ if (! (msa->flags & eslMSA_DIGITAL)) { ESL_ALLOC_WITH_TYPE(nres, int*, sizeof(int) * 26); K = 26; }
/* Function:  esl_sxp_cdf()
 *
 * Purpose:   Calculates the cumulative distribution function for the
 *            stretched exponential pdf, $P(X \leq x)$, given
 *            quantile <x>, offset <mu>, and parameters <lambda> and
 *            <tau>.
 *
 *            Returns 0 for any <x> at or below <mu>. Otherwise the
 *            value is whatever <esl_stats_IncompleteGamma()> stores in
 *            its first output argument for the pair
 *            (1/tau, (lambda*(x-mu))^tau); the status returned by that
 *            call is not checked.
 */
double
esl_sxp_cdf(double x, double mu, double lambda, double tau)
{
  double y;       /* lambda-scaled distance above mu */
  double cdf;

  if (x <= mu) return 0.;

  y = lambda * (x-mu);
  esl_stats_IncompleteGamma(1/tau, exp(tau * log(y)), &cdf, NULL);

  ESL_DASSERT1 (( !isnan(cdf)));
  return cdf;
}
/* jukescantor()
 *
 * The generalized Jukes/Cantor distance calculation.
 * Given <n1> identities and <n2> differences, for a
 * base alphabet size of <alphabet_size> (4 or 20);
 * calculate J/C distance in substitutions/site and
 * return it in <*opt_distance>; calculate large-sample
 * variance and return it in <*opt_variance>. Either
 * output pointer may be NULL if that result is not wanted.
 *
 * When the observed fraction of differences is so large that the
 * argument of the logarithm is <= 0, both results are <HUGE_VAL>.
 *
 * Returns <eslOK> on success.
 * Returns <eslEDIVZERO> if there are no data (<n1+n2=0>); both
 * results are then set to <HUGE_VAL>.
 */
static int
jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance)
{
  int    status = eslOK;
  int    ntot;                  /* number of compared positions            */
  double K, N;                  /* alphabet size, ntot, as doubles         */
  double D;                     /* observed fraction of differences        */
  double x;                     /* argument of the logarithm               */
  double dist = HUGE_VAL;       /* results default to "infinite"           */
  double var  = HUGE_VAL;

  ESL_DASSERT1( (n1 >= 0) );
  ESL_DASSERT1( (n2 >= 0) );
  ESL_DASSERT1( (alphabet_size >= 0) );

  ntot = n1+n2;
  if (ntot == 0)
    {
      status = eslEDIVZERO;
    }
  else
    {
      K = (double) alphabet_size;
      D = (double) n2 / (double) ntot;
      N = (double) ntot;

      x = 1. - D * K/(K-1.);
      if (! (x <= 0.))          /* written as a negation so a NaN <x> also takes this branch */
        {
          dist = -log(x) * K/(K-1);
          var  = exp( 2.*K*dist/(K-1) ) * D * (1.-D) / N;
        }
    }

  if (opt_distance != NULL) *opt_distance = dist;
  if (opt_variance != NULL) *opt_variance = var;
  return status;
}
/* masstrace_counts_to_cdf()
 *
 * Convert endpoint counts in <mass[1..n]> into the two cumulative
 * distributions that meet at position <mid>: running sums from the
 * left for positions below <mid>, running sums from the right for
 * positions above it, everything scaled by 1/<ntr>, and exactly 1.0
 * at <mid> itself. Reads <mass[0]> and <mass[n+1]> as the starting
 * values of the two running sums.
 */
static void
masstrace_counts_to_cdf(float *mass, int n, int mid, int ntr)
{
  int j;

  for (j = 1; j < mid; j++) mass[j] += mass[j-1];
  for (j = n; j > mid; j--) mass[j] += mass[j+1];
  esl_vec_FScale(mass+1, n, 1./(float) ntr);
  mass[mid] = 1.;
}

/* Function:  p7_masstrace_FinishCount()
 * Synopsis:  Convert counted histograms to cumulative endpoint prob distributions.
 *
 * Purpose:   We've finished collecting endpoints from traces with
 *            <_CountTrace()> in <mt>, <ntr> of which had the
 *            specified domain anchor; now convert the counts to
 *            <mt>'s cumulative probability distributions. The
 *            optional <mt->imass> is converted only if it is present;
 *            <mt->kmass> always is.
 *
 * Args:      mt  - mass trace object we've collected endpoint counts in
 *            ntr - number of traces we counted into <mt> that contained the domain anchor
 *
 * Returns:   <eslOK> on success; <mt> is now a valid <P7_MASSTRACE> object
 *            containing envelope endpoint cumulative probability distributions.
 */
int
p7_masstrace_FinishCount(P7_MASSTRACE *mt, int ntr)
{
  ESL_DASSERT1( (ntr > 0) );
  ESL_DASSERT1( (mt->i0)  );
  ESL_DASSERT1( (mt->k0)  );
  ESL_DASSERT1( (p7_trace_IsMain(mt->st0)) );

  if (mt->imass)
    masstrace_counts_to_cdf(mt->imass, mt->L, mt->i0, ntr);
  masstrace_counts_to_cdf(mt->kmass, mt->M, mt->k0, ntr);

  return eslOK;
}
/* Function:  esl_msaweight_BLOSUM()
 * Synopsis:  BLOSUM weights.
 * Incept:    SRE, Sun Nov  5 09:52:41 2006 [Janelia]
 *
 * Purpose:   Given a multiple sequence alignment <msa> and an identity
 *            threshold <maxid>, calculate sequence weights using the
 *            BLOSUM algorithm (Henikoff and Henikoff, PNAS
 *            89:10915-10919, 1992). These weights are stored
 *            internally in the <msa> object, replacing any weights
 *            that may have already been there. Weights are $\geq 0$
 *            and they sum to <msa->nseq>.
 *
 *            The algorithm does a single linkage clustering by
 *            fractional id, defines clusters such that no two clusters
 *            have a pairwise link $\geq$ <maxid>, and assigns
 *            weights of $\frac{1}{M_i}$ to each of the $M_i$
 *            sequences in each cluster $i$. The <maxid> threshold
 *            is a fractional pairwise identity, in the range
 *            $0..1$.
 *
 *            The <msa> may be in either digitized or text mode.
 *            Digital mode is preferred, so that the pairwise identity
 *            calculations deal with degenerate residue symbols
 *            properly.
 *
 *            An alignment of a single sequence is a special case: its
 *            one weight is set to 1.0 and the function returns
 *            immediately.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have been
 *            modified.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      [Henikoff92]; squid::weight.c::BlosumWeights().
 */
int
esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid)
{
  int *cluster = NULL;      /* cluster[i]: cluster index assigned to sequence i   */
  int *csize   = NULL;      /* csize[c]:   number of sequences in cluster c       */
  int  nclust;              /* number of clusters                                 */
  int  idx;
  int  status;

  /* Contract checks */
  ESL_DASSERT1( (maxid >= 0. && maxid <= 1.) );
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );

  if (msa->nseq == 1)
    {
      msa->wgt[0] = 1.0;
      return eslOK;
    }

  status = esl_msacluster_SingleLinkage(msa, maxid, &cluster, NULL, &nclust);
  if (status != eslOK) goto ERROR;

  /* Count cluster sizes */
  ESL_ALLOC(csize, sizeof(int) * nclust);
  esl_vec_ISet(csize, nclust, 0);
  for (idx = 0; idx < msa->nseq; idx++)
    csize[cluster[idx]]++;

  /* Each sequence gets an equal share of its cluster's unit weight */
  for (idx = 0; idx < msa->nseq; idx++)
    msa->wgt[idx] = 1. / (double) csize[cluster[idx]];

  /* Make weights normalize up to nseq, and return. */
  esl_vec_DNorm (msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  free(csize);
  free(cluster);
  return eslOK;

 ERROR:
  free(cluster);
  free(csize);
  return status;
}
/* do_by_windows()
 *
 * Read the sequences in <sqfp> window by window with
 * <esl_sqio_ReadWindow()>, handing each window to the
 * <esl_gencode_Process*()> routines with work state <wrk> and genetic
 * code <gcode>. The sign of the local <windowsize> selects the strand:
 * positive reads the top (Watson) strand, negative the bottom (Crick)
 * strand. When a sequence's top strand is exhausted, the sign is
 * flipped to make a second pass over the same sequence, unless
 * <wrk->do_crick> is off.
 *
 * Returns <eslOK>. Allocation failure, parse errors, invalid residues
 * and any other unexpected read status are fatal: they terminate the
 * program through <esl_fatal()>.
 */
static int
do_by_windows(ESL_GENCODE *gcode, ESL_GENCODE_WORKSTATE *wrk, ESL_SQFILE *sqfp)
{
  ESL_SQ *sq          = esl_sq_CreateDigital(gcode->nt_abc);
  int     windowsize  = 4092;   // can be any value, but a multiple of 3 makes most sense. windowsize can be +/-; + means reading top strand; - means bottom strand.
  int     contextsize = 2;      // contextsize (adjacent window overlap) must be 2, or translation won't work properly.
  int     wstatus;

  /* Without this check, a failed allocation would pass a NULL <sq> on to ReadWindow. */
  if (sq == NULL) esl_fatal("Failed to allocate sequence object");

  ESL_DASSERT1(( windowsize % 3 == 0 ));

  while (( wstatus = esl_sqio_ReadWindow(sqfp, contextsize, windowsize, sq)) != eslEOF)
    {
      if (wstatus == eslEOD)
        {
          /* End of the current strand of the current sequence. */
          if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick))
            esl_gencode_ProcessEnd(wrk, sq);

          // Don't switch to revcomp if we don't need to. Allows -W --watson to work on nonrewindable streams
          if (windowsize > 0 && ! wrk->do_crick) { esl_sq_Reuse(sq); continue; }

          // Do not Reuse the sq on the switch from watson to crick; ReadWindow needs sq->L
          if (windowsize < 0) esl_sq_Reuse(sq);

          windowsize = -windowsize;   // switch to other strand.
          continue;
        }
      else if (wstatus == eslEFORMAT) esl_fatal("Parsing failed in sequence file %s:\n%s",          sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (wstatus == eslEINVAL)  esl_fatal("Invalid residue(s) found in sequence file %s\n%s", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (wstatus != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", wstatus, sqfp->filename);

      /* If we're the first window in this input DNA sequence
       * (or the first window in its revcomp), then initialize.
       * sq->C is the actual context overlap; 0=1st window; 2 (i.e. C)= subsequent.
       */
      if (sq->C == 0)
        {
          if (sq->n < 3) continue;   // DNA sequence too short; skip it, don't even bother to revcomp, go to next sequence.
          if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick))
            esl_gencode_ProcessStart(gcode, wrk, sq);
        }

      if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick))
        esl_gencode_ProcessPiece(gcode, wrk, sq);
    }

  esl_sq_Destroy(sq);
  return eslOK;
}
/* Function:  p7_filtermx_Create_neon64()
 * Synopsis:  Create a one-row DP matrix for MSV, VF (NEON64 version).
 *
 * Purpose:   Allocate a reusable, resizeable one-row <P7_FILTERMX>
 *            suitable for MSV and Viterbi filter calculations on
 *            query profiles of up to <allocM> consensus positions.
 *
 *            <allocM> must be $\leq$ 100,000. This is an H3 design
 *            limit. It is only enforced by a debugging assertion.
 *
 * Args:      allocM - initial allocation size, in profile positions. (>=1, <=100000)
 *
 * Returns:   ptr to new <P7_FILTERMX>.
 *
 * Throws:    <NULL> on allocation failure.
 *
 * Note:      When compiled without <HAVE_NEON64>, the function does
 *            nothing and always returns <NULL>.
 */
P7_FILTERMX *
p7_filtermx_Create_neon64(int allocM)
{
#ifdef HAVE_NEON64
  P7_FILTERMX *fx = NULL;
  char        *padded;          /* raw allocation, advanced by the alignment slop */
  int          status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

  /* Allocate the shell and put every field in a defined state first, so
   * the ERROR path below never sees indeterminate pointers.
   */
  ESL_ALLOC(fx, sizeof(P7_FILTERMX));
  fx->simd   = NEON64;          /* ISA we're using */
  fx->type   = p7F_NONE;
  fx->M      = 0;
  fx->allocM = 0;
  fx->dp_mem = NULL;
  fx->dp     = NULL;
#ifdef p7_DEBUGGING
  fx->do_dumping = FALSE;
  fx->dfp        = NULL;
#endif

  /* Raw memory: p7F_NSCELLS * P7_NVW(allocM) vectors, plus (p7_VALIGN-1)
   * bytes of alignment slop.
   */
  ESL_ALLOC(fx->dp_mem,
            (sizeof(esl_neon_128i_t) * p7F_NSCELLS * P7_NVW(allocM)) + (p7_VALIGN-1));
  fx->allocM = allocM;

  /* Manual memory alignment incantation: step past the slop, then mask
   * the address with p7_VALIMASK (presumably clearing the low bits, so
   * <dp> lands on a p7_VALIGN boundary inside <dp_mem>).
   */
  padded = (char *) fx->dp_mem + (p7_VALIGN-1);
  fx->dp = (esl_neon_128i_t *) ( (unsigned long int) padded & p7_VALIMASK );
  return fx;

 ERROR:
  p7_filtermx_Destroy(fx);
  return NULL;

#else  /* ! HAVE_NEON64 */
  return NULL;
#endif /* HAVE_NEON64 */
}
/* Function:  p7_filtermx_Create_avx()
 * Synopsis:  Create a one-row DP matrix for MSV, VF (AVX version).
 *
 * Purpose:   Allocate a reusable, resizeable one-row <P7_FILTERMX>
 *            suitable for MSV and Viterbi filter calculations on
 *            query profiles of up to <allocM> consensus positions.
 *
 *            <allocM> must be $\leq$ 100,000. This is an H3 design
 *            limit. It is only enforced by a debugging assertion.
 *
 * Args:      allocM - initial allocation size, in profile positions. (>=1, <=100000)
 *
 * Returns:   ptr to new <P7_FILTERMX>.
 *
 * Throws:    <NULL> on allocation failure.
 *
 * Note:      When compiled without <HAVE_AVX2>, only the contract
 *            check runs and the function always returns <NULL>.
 */
P7_FILTERMX *
p7_filtermx_Create_avx(int allocM)
{
  P7_FILTERMX *fx = NULL;
  int          status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

#ifdef HAVE_AVX2
  ESL_ALLOC(fx, sizeof(P7_FILTERMX));
  fx->simd       = AVX;         /* ISA we're using */
  fx->M          = 0;           /* the NEON64 constructor initializes M; do likewise so it is never indeterminate */
  fx->dp_AVX     = NULL;
  fx->dp_mem_AVX = NULL;
  fx->allocM_AVX = 0;
  fx->type       = p7F_NONE;
#ifdef p7_DEBUGGING
  fx->do_dumping = FALSE;
  fx->dfp        = NULL;
#endif

  /* Raw memory: p7F_NSCELLS * P7_NVW_AVX(allocM) vectors, plus
   * (p7_VALIGN_AVX-1) bytes of alignment slop.
   */
  ESL_ALLOC(fx->dp_mem_AVX,
            (sizeof(__m256i) * p7F_NSCELLS * P7_NVW_AVX(allocM)) + (p7_VALIGN_AVX-1));
  fx->allocM_AVX = allocM;

  /* Manual memory alignment incantation: step past the slop, then mask
   * the address with p7_VALIMASK_AVX (presumably clearing the low bits).
   */
  fx->dp_AVX = (__m256i *) ( (unsigned long int) ( (char *) fx->dp_mem_AVX + (p7_VALIGN_AVX-1) ) & p7_VALIMASK_AVX);
  return fx;

 ERROR:
  p7_filtermx_Destroy(fx);
  return NULL;

#else  /* ! HAVE_AVX2 */
  return NULL;
#endif /* HAVE_AVX2 */
}
/* wei_binned_func(): * Returns the negative log likelihood of a binned data sample, * in the API of the conjugate gradient descent optimizer in esl_minimizer. */ static double wei_binned_func(double *p, int nparam, void *dptr) { struct wei_binned_data *data = (struct wei_binned_data *) dptr; ESL_HISTOGRAM *h = data->h; double lambda, tau; double logL; double ai,bi; int i; double tmp; /* Unpack what the optimizer gave us. */ lambda = exp(p[0]); /* see below for c.o.v. notes */ tau = exp(p[1]); logL = 0.; for (i = h->cmin; i <= h->imax; i++) { if (h->obs[i] == 0) continue; ai = esl_histogram_Bin2LBound(h,i); bi = esl_histogram_Bin2UBound(h,i); if (ai < data->mu) ai = data->mu; tmp = esl_wei_cdf(bi, data->mu, lambda, tau) - esl_wei_cdf(ai, data->mu, lambda, tau); /* for cdf~1.0, numerical roundoff error can create tmp<0 by a * teensy amount; tolerate that, but catch anything worse */ ESL_DASSERT1( (tmp + 1e-7 > 0.)); if (tmp <= 0.) return eslINFINITY; logL += h->obs[i] * log(tmp); } return -logL; /* goal: minimize NLL */ }
/* Function:  esl_mixgev_FitComplete()
 *
 * Purpose:   Given <n> observed data values <x[0..n-1]> and a mixture
 *            GEV <mg> whose parameters hold an initial guess, run
 *            conjugate gradient descent to find a locally optimal
 *            maximum likelihood mixture GEV fit to the data.
 *
 *            To obtain a reasonable initial guess for <mg>,
 *            see <esl_mixgev_FitGuess()>.
 *
 * Args:      x   - observed data, <x[0..n-1]>.
 *            n   - number of samples in <x>
 *            mg  - mixture GEV to estimate, w/ params set to
 *                  an initial guess.
 *
 * Returns:   <eslOK> on success, and <mg> contains local
 *            ML estimate for mixture GEV parameters.
 *
 * Throws:    <eslEMEM> on allocation error, and <mg> is unchanged
 *            from its initial state. Any non-<eslOK> status from the
 *            optimizer is passed back the same way, before the
 *            parameters are unpacked into <mg>.
 */
int
esl_mixgev_FitComplete(double *x, int n, ESL_MIXGEV *mg)
{
  struct mixgev_data data;
  double *p    = NULL;          /* parameter vector handed to the optimizer  */
  double *u    = NULL;          /* step size for each parameter              */
  double *wrk  = NULL;          /* workspace, 4*np doubles                   */
  double  tol  = 1e-6;
  double  fx;                   /* objective value at the optimum (not used further) */
  int     np;                   /* number of free parameters                 */
  int     nset;                 /* number of entries of <u> filled so far    */
  int     k;
  int     status;

  /* Count the free parameters: K-1 mixture coefficients, then 2 per
   * Gumbel component and 3 per other component.
   */
  np = mg->K - 1;
  for (k = 0; k < mg->K; k++)
    {
      if (mg->isgumbel[k]) np += 2;
      else                 np += 3;
    }

  ESL_ALLOC(p,   sizeof(double) * np);
  ESL_ALLOC(u,   sizeof(double) * np);
  ESL_ALLOC(wrk, sizeof(double) * np * 4);

  /* Bundle up what the objective function needs to see. */
  data.mg  = mg;
  data.wrk = wrk;
  data.n   = n;
  data.x   = x;

  /* From mg, create the parameter vector. */
  mixgev_pack_paramvector(p, np, mg);

  /* Define the step size vector u, laid out in the same order used for
   * counting <np> above: the mixture coefficients first, then each
   * component's parameters.
   */
  nset = 0;
  for (k = 1; k < mg->K; k++)
    {
      u[nset] = 1.0;
      nset++;
    }
  for (k = 0; k < mg->K; k++)
    {
      u[nset]   = 1.0;          /* the two parameters every component has */
      u[nset+1] = 1.0;
      nset += 2;
      if (! mg->isgumbel[k])    /* the extra third parameter gets a much smaller step */
        {
          u[nset] = 0.02;
          nset++;
        }
    }
  ESL_DASSERT1( (np == nset) );

  /* Feed it all to the mighty optimizer. */
  status = esl_min_ConjugateGradientDescent(p, u, np,
                                            &mixgev_complete_func,
                                            NULL,
                                            (void *) (&data),
                                            tol, wrk, &fx);
  if (status != eslOK) goto ERROR;

  /* Convert the final parameter vector back to a mixture GEV */
  mixgev_unpack_paramvector(p, np, mg);

  /* status is eslOK here; success falls through into the shared cleanup. */

 ERROR:
  free(p);                      /* free(NULL) is a no-op, so no guards are needed */
  free(u);
  free(wrk);
  return status;
}
/* Function:  esl_keyhash_CreateCustom()
 * Synopsis:  Allocate a new keyhash with customized initial allocations.
 *
 * Purpose:   Create a new hash table whose initial allocation is
 *            for <hashsize> hash table entries, <kalloc> keys, and
 *            a total key string length of <salloc>. <hashsize>
 *            must be a power of 2, and all allocations must be
 *            $\geq 0$.
 *
 *            The object will still expand as needed, so the reason to
 *            use a customized allocation is when you're trying to
 *            minimize memory footprint and you expect your keyhash to
 *            be smaller than the default (of up to 128 keys, of total
 *            length up to 2048).
 *
 * Returns:   ptr to the new <ESL_KEYHASH>.
 *
 * Throws:    <NULL> on allocation failure.
 */
ESL_KEYHASH *
esl_keyhash_CreateCustom(uint32_t hashsize, int kalloc, int salloc)
{
  ESL_KEYHASH *kh;

  /* A power of 2 is nonzero and has a single bit set, so clearing its
   * lowest set bit with x & (x-1) must leave zero. Checked only when
   * debugging assertions are compiled in.
   */
  ESL_DASSERT1(( hashsize != 0 && (hashsize & (hashsize - 1)) == 0 ));

  kh = keyhash_create(hashsize, kalloc, salloc);
  return kh;
}
static int profillic_esl_msafile_profile_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa, ProfileType * profile_ptr ) { /// \note Right now this isn't actually using the open file pointer; for convenience I just use the profile.fromFile( <filename> ) method. /// \todo Use convenience fns in esl_buffer.h; see eg hmmer-3.1/easel/esl_msafile_stockholm.c for examples... ESL_MSA *msa = NULL; string profile_string; char *buf; long len; int seqidx; int status; char errmsg2[eslERRBUFSIZE]; ESL_DASSERT1((afp->format == eslMSAFILE_PROFILLIC)); const char * const seqname = "Galosh Profile Consensus"; const char * const msaname = "Galosh Profile"; uint32_t profile_length; galosh::Sequence<typename ProfileType::ProfileResidueType> consensus_sequence; stringstream tmp_consensus_output_stream; uint32_t pos_i; if (profile_ptr == NULL) { ESL_EXCEPTION(eslEINCONCEIVABLE, "profile_ptr is NULL in profillic_esl_msafile_profile_Read(..)!"); } //if (feof(afp->bf->fp)) { status = eslEOF; goto ERROR; } afp->errmsg[0] = '\0'; // Read in the galosh profile (from profillic) //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end //len = afp->bf->ftell( afp->bf->fp ); // get the position at the end (length) //fseek( afp->bf->fp, 0, SEEK_SET ); // go to the beginning again. //ESL_ALLOC_CPP( char, buf, sizeof( char ) * len ); //malloc buffer //fread( buf, len, 1, afp->bf->fp ); //read into buffer //profile_string = buf; //profile_ptr->fromString( profile_string ); profile_ptr->fromFile( afp->bf->filename ); //if (buf) free(buf); // \todo WHY WON'T THIS WORK? See HACKs in profillic-hmmbuild.cpp to work around it. //fseek( afp->bf->fp, 0, SEEK_END ); // go to the end (to signal there's no more profiles in the file, the next time we come to this function) // Calculate the consensus sequence. 
profile_length = profile_ptr->length(); consensus_sequence.reinitialize( profile_length ); for( pos_i = 0; pos_i < profile_length; pos_i++ ) { consensus_sequence[ pos_i ] = ( *profile_ptr )[ pos_i ][ galosh::Emission::Match ].maximumValueType(); } tmp_consensus_output_stream << consensus_sequence; /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */ #ifdef eslAUGMENT_ALPHABET if (afp->abc && (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; } #endif if (! afp->abc && (msa = esl_msa_Create( 16, -1)) == NULL) { status = eslEMEM; goto ERROR; } // Set first-and-only seq to the consensus. This should set sqlen[0] to the profile's length and set ax to have length 1 and ax[0] to be the sequence itself. Also msa->sqname[0] to the "name" of that consensus sequence. /* if nec, make room for the new seq */ if (msa->nseq >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) return status; seqidx = msa->nseq; // 0 msa->nseq++; // = 1 status = esl_strdup(seqname, -1, &(msa->sqname[seqidx])); // NOTE: Could add description of this "sequence" here, using esl_msa_SetSeqDescription(msa, seqidx, desc). #ifdef eslAUGMENT_ALPHABET if (msa->flags & eslMSA_DIGITAL) { // NOTE (profillic): There was a bug in this; it had said .."esl_abc_dsqcat(msa->abc, " where it should have said .."esl_abc_dsqcat(msa->abc->inmap, " if((status = esl_abc_dsqcat(msa->abc->inmap, &(msa->ax[seqidx]), &(msa->sqlen[seqidx]), tmp_consensus_output_stream.str().c_str(), profile_length)) != eslOK) { /* invalid char(s), get informative error message */ if (esl_abc_ValidateSeq(msa->abc, tmp_consensus_output_stream.str().c_str(), profile_length, afp->errmsg) != eslOK) ESL_XFAIL(eslEFORMAT, errmsg2, "%s (line %d): %s", msa->sqname[0], afp->linenumber, afp->errmsg); } } #endif if (! 
(msa->flags & eslMSA_DIGITAL)) { status = esl_strcat(&(msa->aseq[seqidx]), 0, tmp_consensus_output_stream.str().c_str(), profile_length); msa->sqlen[seqidx] = profile_length; } msa->alen = profile_length; /// \todo OR read in a fasta file of sequences too. /// \todo (Optional?) Set msa->name to the name of the profile (file?) esl_strdup(msaname, -1, &(msa->name)); /// \todo make sure eslMSA_HASWGTS is FALSE .. OR set it to TRUE and set msa->wgt[idx] to 1.0. /// \note Could have secondary structure (per sequence) too. msa->ss[0]. msa->sslen[0] should be the same as msa->sqlen[0]. /// \todo Investigate what msa->sa and msa->pp are for. /* Give the newly parsed MSA a good * going-over, and finalize the fields of the MSA data structure. * verify_parse will fill in errmsg if it sees a problem. */ //if (verify_parse(msa, afp->errmsg) != eslOK) { status = eslEFORMAT; goto ERROR; } if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR; if (ret_msa != NULL) *ret_msa = msa; else esl_msa_Destroy(msa); return eslOK; ERROR: if (msa != NULL) esl_msa_Destroy(msa); if (ret_msa != NULL) *ret_msa = NULL; return status; }