示例#1
0
/* Function:  p7_anchors_SampleFromTrace()
 * Synopsis:  Build an anchor set by sampling one match state per domain.
 *
 * Purpose:   For each of the <tr->ndom> domains in trace <tr>, choose
 *            one of its match states uniformly at random (using
 *            <rng>) and store that state's sequence and model
 *            coordinates as the domain's anchor in <anch>. <anch> is
 *            resized first via <p7_anchors_Resize()>.
 *
 *            The domain bounds <tr->tfrom[]>, <tr->tto[]> are read,
 *            so <tr> must already have been indexed by the caller
 *            with <p7_trace_Index()>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    whatever non-<eslOK> status <p7_anchors_Resize()> reports
 *            (<eslEMEM> on reallocation failure).
 */
int
p7_anchors_SampleFromTrace(P7_ANCHORS *anch, ESL_RANDOMNESS *rng, const P7_TRACE *tr)
{
    const int ndom = tr->ndom;
    int       d;                   // domain index in anchor numbering, 1..D
    int       z;                   // position in the trace
    int       nmatch;              // number of match states in the current domain
    int       pick;                // which match state (1..nmatch) becomes the anchor
    int       status;

    status = p7_anchors_Resize(anch, ndom);
    if (status != eslOK) return status;

    for (d = 1; d <= ndom; d++)
    {
        // P7_TRACE numbers its domains 0..D-1; P7_ANCHORS uses 1..D.
        const int zfirst = tr->tfrom[d-1];
        const int zlast  = tr->tto[d-1];

        nmatch = 0;
        for (z = zfirst; z <= zlast; z++)
        {
            if (p7_trace_IsM(tr->st[z])) nmatch++;
        }
        ESL_DASSERT1(( nmatch ));

        pick = 1 + esl_rnd_Roll(rng, nmatch);

        // Walk forward from the domain start, counting match states down;
        // stop *on* the match state that exhausts the count.
        for (z = zfirst; ; z++)
        {
            if (p7_trace_IsM(tr->st[z]) && --pick == 0) break;
        }
        ESL_DASSERT1(( p7_trace_IsM(tr->st[z]) ));

        anch->a[d].i0 = tr->i[z];
        anch->a[d].k0 = tr->k[z];
    }

    p7_anchor_SetSentinels(anch->a, ndom, tr->L, tr->M);
    anch->D = ndom;
    return eslOK;
}
示例#2
0
/* Function:  esl_msaweight_IDFilter()
 * Synopsis:  Filter by %ID.
 * Incept:    ER, Wed Oct 29 10:06:43 2008 [Janelia]
 *
 * Purpose:   Build a new alignment from <msa> that omits
 *            near-identical sequences. Sequences are visited in
 *            order; a sequence is dropped when its pairwise identity
 *            (computed on the alignment as given) to any
 *            already-kept sequence exceeds <maxid>. The earlier
 *            sequence of such a pair is therefore the one retained.
 *            The input <msa> is not changed.
 *
 *            Usually called as an ad hoc sequence "weighting" mechanism.
 *
 * Limitations:
 *            Unparsed Stockholm markup is not propagated into the
 *            new alignment.
 *
 * Return:    <eslOK> on success, with the new alignment in <*ret_newmsa>.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      squid::weight.c::FilterAlignment().
 */
int
esl_msaweight_IDFilter(const ESL_MSA *msa, double maxid, ESL_MSA **ret_newmsa)
{
  int    *kept   = NULL;     /* indices of the sequences retained so far        */
  int    *useme  = NULL;     /* per-sequence flag: TRUE if retained             */
  int     nkept  = 0;        /* number of entries in <kept>                     */
  double  pid;               /* pairwise identity of current pair               */
  int     i, j;
  int     status;

  /* Contract checks */
  ESL_DASSERT1( (msa       != NULL) );
  ESL_DASSERT1( (msa->nseq >= 1)    );
  ESL_DASSERT1( (msa->alen >= 1)    );

  ESL_ALLOC(kept,  sizeof(int) * msa->nseq);
  ESL_ALLOC(useme, sizeof(int) * msa->nseq);
  esl_vec_ISet(useme, msa->nseq, 0);

  /* Greedy pass: compare each sequence against everything kept so far. */
  for (i = 0; i < msa->nseq; i++)
    {
      for (j = 0; j < nkept; j++)
        {
          if (msa->flags & eslMSA_DIGITAL)
            {
#ifdef eslAUGMENT_ALPHABET
              status = esl_dst_XPairId(msa->abc, msa->ax[i], msa->ax[kept[j]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
#endif
            }
          else
            {
              status = esl_dst_CPairId(msa->aseq[i], msa->aseq[kept[j]], &pid, NULL, NULL);
              if (status != eslOK) goto ERROR;
            }

          if (pid > maxid) break;   /* too similar to a kept sequence: discard i */
        }

      /* The inner loop ran to completion only if nothing was too similar. */
      if (j == nkept)
        {
          kept[nkept++] = i;
          useme[i]      = TRUE;
        }
    }

  status = esl_msa_SequenceSubset(msa, useme, ret_newmsa);
  if (status != eslOK) goto ERROR;

  free(kept);
  free(useme);
  return eslOK;

 ERROR:
  free(kept);
  free(useme);
  return status;
}
示例#3
0
/* sxp_complete_binned_func()
 *
 * Objective function for fitting a stretched exponential to binned
 * data: returns the negative log likelihood of the histogram counts
 * in <data->g>, given the parameter vector <p>. The parameters are
 * stored as logs: lambda = exp(p[0]), tau = exp(p[1]). <np> is not
 * used. <dptr> must point to a <struct sxp_binned_data>.
 *
 * Returns <eslINFINITY> if any occupied bin has zero probability mass
 * under the current parameters.
 */
static double 
sxp_complete_binned_func(double *p, int np, void *dptr)
{
  struct sxp_binned_data *data   = dptr;
  ESL_HISTOGRAM          *h      = data->g;
  const double            lambda = exp(p[0]);
  const double            tau    = exp(p[1]);
  double                  logL   = 0.;
  double                  lo, hi;      /* lower, upper bounds of current bin */
  double                  mass;        /* probability mass within the bin    */
  int                     b;

  ESL_DASSERT1(( ! isnan(lambda) ));
  ESL_DASSERT1(( ! isnan(tau) ));

  for (b = h->cmin; b <= h->imax; b++)
    {
      if (h->obs[b] != 0)               /* only occupied bins contribute */
        {
          lo = esl_histogram_Bin2LBound(h, b);
          hi = esl_histogram_Bin2UBound(h, b);
          if (lo < data->mu) lo = data->mu;   /* clip the leftmost bin at mu */

          mass = esl_sxp_cdf(hi, data->mu, lambda, tau) -
                 esl_sxp_cdf(lo, data->mu, lambda, tau);
          if (mass == 0.) return eslINFINITY;

          logL += h->obs[b] * log(mass);
        }
    }
  return -logL;			/* the optimizer minimizes, so hand back the NLL */
}
示例#4
0
/* mpas_stats_tally_domain()
 *
 * Record in <stats> the fate of one trace domain that was hit by
 * <nanch> anchors: zero hits, exactly one hit, or multiple hits. The
 * anchors themselves are counted as unique or multiple accordingly.
 */
static void
mpas_stats_tally_domain(P7_MPAS_STATS *stats, int nanch)
{
  if      (nanch == 0) { stats->dom_zero++; }
  else if (nanch == 1) { stats->anch_unique++; stats->dom_one++; }
  else if (nanch >  1) { stats->anch_multiple += nanch; stats->dom_multiple++; }
}

/* Function:  p7_mpas_stats_CompareAS2Trace()
 * Synopsis:  Compare an anchor set to the domains of a trace.
 *
 * Purpose:   Classify each anchor in <anch> by where its <i0>
 *            coordinate falls relative to the domains
 *            <tr->sqfrom[]..tr->sqto[]> of trace <tr>, and classify
 *            each domain by how many anchors hit it. Results go into
 *            <stats>:
 *              anch_outside / anch_unique / anch_multiple :
 *                 anchors outside any domain, alone in a domain, or
 *                 sharing a domain with other anchors;
 *              dom_zero / dom_one / dom_multiple :
 *                 domains hit by 0, 1, or 2+ anchors.
 *            <stats->has_part2> is set to TRUE.
 *
 *            The single forward sweep advances through anchors and
 *            domains together, so it relies on both being in
 *            increasing sequence-coordinate order.
 *
 *            Watch out: anchors are numbered 1..D in <anch>; domains
 *            are numbered 0..ndom-1 in <tr>.
 *
 * Returns:   <eslOK> on success.
 */
int
p7_mpas_stats_CompareAS2Trace(P7_MPAS_STATS *stats, const P7_ANCHORS *anch, const P7_TRACE *tr)
{
  int ad;
  int td              = 0;
  int anch_in_this_td = 0;

  stats->anch_outside    = 0;
  stats->anch_unique     = 0;
  stats->anch_multiple   = 0;

  stats->dom_zero        = 0;
  stats->dom_one         = 0;
  stats->dom_multiple    = 0;

  for (ad = 1; ad <= anch->D; ad++)
    {
      /* Test for "ran out of domains" FIRST: tr->sqfrom[td] and
       * tr->sqto[td] are only valid for td < tr->ndom, and this also
       * covers a trace with no domains at all.
       */
      if (td == tr->ndom || anch->a[ad].i0 < tr->sqfrom[td])
	stats->anch_outside++;
      else if (anch->a[ad].i0 <= tr->sqto[td])    /* i0 >= sqfrom[td] is already known here */
	anch_in_this_td++;
      else 
	{
	  /* Anchor lies beyond domain <td>: close that domain out,
	   * advance to the next one, and look at this anchor again.
	   */
	  mpas_stats_tally_domain(stats, anch_in_this_td);
	  anch_in_this_td = 0;
	  td++;
	  ad--;			/* forces reevaluation of <ad> when we go back around */
	}
    }
  
  /* Out of anchors. If td == tr->ndom, every domain has already been
   * tallied. Otherwise domain <td> is still open (with whatever
   * anchors it collected), and any domains after it got zero anchors.
   */
  for (; td < tr->ndom; td++)
    {
      mpas_stats_tally_domain(stats, anch_in_this_td);
      anch_in_this_td = 0;
    }

  ESL_DASSERT1(( stats->dom_zero     + stats->dom_one     + stats->dom_multiple  == tr->ndom ));
  ESL_DASSERT1(( stats->anch_outside + stats->anch_unique + stats->anch_multiple == anch->D  ));

  stats->has_part2 = TRUE;
  return eslOK;
}
示例#5
0
/* Function:  p7_masstrace_CountTrace()
 * Synopsis:  Count domain endpoints into endpoint distributions.
 *
 * Purpose:   Look in traceback <tr> for a domain that passes through
 *            the anchor <i0>,<k0>,<st0>. If there is none, return
 *            without touching anything. If there is one, find that
 *            domain's start and end coordinates (ia..ib on the
 *            sequence, ka..kb on the model), add one count for each
 *            endpoint to <mt>, and add one to <*ntr>.
 *
 *            This is useful in unit tests that approximate the mass
 *            trace calculation with a large ensemble of stochastic
 *            tracebacks.
 *
 *            Call <p7_masstrace_Zero()> on <mt> before the first
 *            <_CountTrace()> call to initialize it.
 *
 *            What accumulates in <mt> is a histogram. Once the whole
 *            ensemble has been counted, <mt> still has to be
 *            converted to a cumulative distribution.
 *
 *            The 'midpoints' <kmass[k0]> and <imass[i0]> are a
 *            special case. In the final cumulative distribution,
 *            <kmass[k]> is P(ka <= k) for k <= k0 and P(kb >= k) for
 *            k >= k0, so <kmass[k0]> serves as both a start and an
 *            end value; that is harmless there because both are 1.0.
 *            In a histogram, though, a start count and an end count
 *            landing on k0 would have to be kept apart. Rather than
 *            do that, endpoints that coincide with the anchor are
 *            simply not counted here; the conversion to a cumulative
 *            distribution is expected to set those cells to 1.0.
 *
 *            Counts are accumulated in single-precision floating
 *            point, so an ensemble of more than about $10^7$ traces
 *            cannot be counted accurately.
 *
 * Args:      tr  - trace structure
 *            i0  - i sequence position coord of domain anchor
 *            k0  - k model position coord of domain anchor
 *            st0 - a main model {MID}{LG} state type of domain anchor
 *            mt  - mass trace object to count endpoints in
 *            ntr - updated number of traces that contain the anchor
 *
 * Returns:   <eslOK> on success. "Success" includes ignoring a trace
 *            that does not contain the anchor. When the anchor is
 *            present, endpoint counts in <mt> and <*ntr> are each
 *            incremented by one, and <mt->i0>, <mt->k0>, <mt->st0>
 *            are set to the anchor.
 */
int
p7_masstrace_CountTrace(const P7_TRACE *tr, int i0, int k0, int st0, P7_MASSTRACE *mt, int *ntr)
{
  int cur_i   = 0;       /* most recent sequence position seen while scanning */
  int zanchor;           /* trace position of the anchor                     */
  int z;
  int ia, ib;            /* domain start/end on the sequence                 */
  int ka, kb;            /* domain start/end on the model                    */
  int found   = FALSE;

  /* Contract checks on arguments */
  ESL_DASSERT1( ( i0>=1 && i0 <= tr->L) );
  ESL_DASSERT1( ( k0>=1 && k0 <= tr->M) );
  ESL_DASSERT1( ( p7_trace_IsMain(st0)) );
  ESL_DASSERT1( ( mt->i0  == 0 || mt->i0  == i0) );
  ESL_DASSERT1( ( mt->k0  == 0 || mt->k0  == k0) );
  ESL_DASSERT1( ( mt->st0 == 0 || mt->st0 == st0) );
  ESL_DASSERT1( ( mt->L  == tr->L) );
  ESL_DASSERT1( ( mt->M  == tr->M) );

  /* Scan for the anchor. Only emitting states carry a nonzero tr->i[],
   * so <cur_i> remembers the last one seen.
   */
  zanchor = 0;
  while (zanchor < tr->N)
    {
      if (tr->i[zanchor] != 0) cur_i = tr->i[zanchor];
      if (cur_i > i0) break;                         /* walked past i0: anchor absent */
      if (cur_i == i0 && tr->st[zanchor] == st0 && tr->k[zanchor] == k0)
        {
          found = TRUE;
          break;
        }
      zanchor++;
    }
  if (! found) return eslOK;     /* no anchor: nothing to count, still a success */

  /* Walk left from the anchor to the B state; the last nonzero i,k seen
   * are the domain's start coordinates.
   */
  ia = i0;
  ka = k0;
  z  = zanchor;
  while (z >= 0 && tr->st[z] != p7T_B)
    {
      if (tr->i[z]) ia = tr->i[z];
      if (tr->k[z]) ka = tr->k[z];
      z--;
    }
  ESL_DASSERT1( ( tr->st[z] == p7T_B) );

  /* Walk right from the anchor to the E state for the end coordinates. */
  ib = i0;
  kb = k0;
  z  = zanchor;
  while (z < tr->N && tr->st[z] != p7T_E)
    {
      if (tr->i[z]) ib = tr->i[z];
      if (tr->k[z]) kb = tr->k[z];
      z++;
    }
  ESL_DASSERT1( ( tr->st[z] == p7T_E) );

  /* Count the endpoints, skipping any that sit exactly on the anchor. */
  if (ka < k0) mt->kmass[ka] += 1.;
  if (kb > k0) mt->kmass[kb] += 1.;
  if (mt->imass)                          /* <imass> is optional in <mt> */
    {
      if (ia < i0) mt->imass[ia] += 1.;
      if (ib > i0) mt->imass[ib] += 1.;
    }
  *ntr += 1;

  /* Record which anchor this <mt> has been counting for. */
  mt->i0  = i0;
  mt->k0  = k0;
  mt->st0 = st0;
  return eslOK;
}
示例#6
0
/* ILogsumNI()
 *
 * Table-driven integer log-sum of <s1> and <s2>: returns the larger
 * score plus the correction ilogsum_lookup[|s1-s2|], or just the larger
 * score when the two differ by LOGSUM_TBL or more.
 *
 * Caller guarantees s1 >= -INFTY and s2 >= -INFTY.
 */
int
ILogsumNI(int s1, int s2)
{
  ESL_DASSERT1((s1 >= -INFTY));
  ESL_DASSERT1((s2 >= -INFTY));
  const int delta = s1 - s2;

  if (delta > 0)
    {	/* s1 is the larger score */
      return (delta >= LOGSUM_TBL) ? s1 : s1 + ilogsum_lookup[delta];
    }
  else
    {	/* s2 is the larger (or equal) score */
      return (delta <= -LOGSUM_TBL) ? s2 : s2 + ilogsum_lookup[-delta];
    }
} 
示例#7
0
/* a2m_padding_digital()
 *
 * Expand every digital sequence in <msa> to a common aligned length.
 *
 * The final alignment has <ncons> consensus columns; before consensus
 * column <cpos> (and after the last one, for cpos == ncons) there are
 * <nins[cpos]> insert columns. For sequence <idx>, residues whose
 * <csflag[idx][spos]> is FALSE are copied into the insert columns and
 * the remainder of each insert region is filled with the alphabet's
 * gap symbol; the next residue is then copied into the consensus
 * column.
 *
 * Also allocates <msa->rf>, writing '.' for insert columns and 'x' for
 * consensus columns, and sets <msa->alen>. Each <msa->ax[idx]> is freed
 * and replaced by the newly built array.
 *
 * NOTE(review): the insert-copy loop stops only on a non-FALSE flag, so
 * this appears to assume each <csflag[idx]> row carries a non-FALSE
 * entry past the last residue -- confirm against the caller.
 *
 * Returns <eslOK> on success; on allocation failure, the status set by
 * <ESL_ALLOC()>.
 */
static int
a2m_padding_digital(ESL_MSA *msa, char **csflag, int *nins, int ncons)
{
  ESL_DSQ *newax  = NULL;		/* sequence being built; swapped into msa->ax[] when complete */
  ESL_DSQ  gapsym = esl_abc_XGetGap(msa->abc);
  int      apos;			/* number of aligned columns written so far  */
  int      spos;			/* number of source residues consumed so far */
  int      cpos;			/* consensus column index, 0..ncons          */
  int      alen;
  int      nfilled;			/* insert columns filled in current region   */
  int      idx;
  int      status;

  /* Total width = consensus columns + all insert regions. */
  alen = ncons;
  for (cpos = 0; cpos <= ncons; cpos++) alen += nins[cpos];

  /* Reference annotation line. */
  ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
  apos = 0;
  for (cpos = 0; cpos <= ncons; cpos++)
    {
      for (nfilled = 0; nfilled < nins[cpos]; nfilled++) msa->rf[apos++] = '.';
      if  (cpos < ncons)                                 msa->rf[apos++] = 'x';
    }
  msa->rf[apos] = '\0';

  /* Rebuild each sequence; digital sequences are 1-based with sentinels at both ends. */
  for (idx = 0; idx < msa->nseq; idx++)
    {
      ESL_ALLOC(newax, sizeof(ESL_DSQ) * (alen + 2));    
      newax[0] = eslDSQ_SENTINEL;
      apos = 0;
      spos = 0;
      for (cpos = 0; cpos <= ncons; cpos++)
        {
          /* residues flagged as non-consensus go into the insert region... */
          for (nfilled = 0; csflag[idx][spos] == FALSE; nfilled++)
            newax[++apos] = msa->ax[idx][++spos];

          /* ...which is then padded out with gaps... */
          for (; nfilled < nins[cpos]; nfilled++)
            newax[++apos] = gapsym;

          /* ...followed by the consensus residue, except after the last region. */
          if (cpos < ncons)
            newax[++apos] = msa->ax[idx][++spos];
        }
      ESL_DASSERT1( (msa->ax[idx][spos+1] == eslDSQ_SENTINEL) );
      ESL_DASSERT1( (apos == alen) );
      newax[alen+1] = eslDSQ_SENTINEL;

      free(msa->ax[idx]);
      msa->ax[idx] = newax;
      newax        = NULL;
    }
  msa->alen = alen;

  return eslOK;
  
 ERROR:
  if (newax) free(newax);
  return status;
}
示例#8
0
/* Function:  p7_masstrace_Zero()
 * Synopsis:  Initialize cumulative endpoint distributions to zeros.
 *
 * Purpose:   Prepare <mt> for collecting masstrace endpoint data for
 *            a sequence of length <L> and a profile of length <M>:
 *            record <L> and <M> in <mt>, set the first <M+2> cells of
 *            <mt->kmass> to zero and, if the optional <mt->imass>
 *            array is present, the first <L+2> cells of that too.
 *
 * Args:      mt - mass trace object to collect endpoint data in
 *            M  - profile length
 *            L  - sequence length
 *
 * Returns:   <eslOK> on success.
 */
int
p7_masstrace_Zero(P7_MASSTRACE *mt, int M, int L)
{
  const int ncells_i = L + 2;
  const int ncells_k = M + 2;

  /* contract checks: the arrays must already be large enough */
  ESL_DASSERT1( (mt->imass == NULL || L+2 <= mt->ialloc ) );
  ESL_DASSERT1( (M+2 <= mt->kalloc) );

  mt->L = L;
  mt->M = M;

  esl_vec_FSet(mt->kmass, ncells_k, 0.0f);
  if (mt->imass != NULL)
    {
      esl_vec_FSet(mt->imass, ncells_i, 0.0f);
    }
  return eslOK;
}
示例#9
0
/* numeric_derivative()
 *
 * Estimate the negative gradient of <func> at point <x> by central
 * differences, writing component i to <dx[i]>.
 *
 * For each of the <n> coordinates, the step is |u[i] * relstep|;
 * <func> is evaluated at x[i]+step and x[i]-step (other coordinates
 * unchanged) and <x[i]> is restored afterwards. <prm> is passed
 * through to <func> untouched.
 */
static void
numeric_derivative(double *x, double *u, int n, 
		   double (*func)(double *, int, void*),
		   void *prm, double relstep,
		   double *dx)
{
  int i;

  for (i = 0; i < n; i++)
    {
      const double step = fabs(u[i] * relstep);
      const double x0   = x[i];
      double       fplus, fminus;

      x[i]   = x0 + step;
      fplus  = func(x, n, prm);

      x[i]   = x0 - step;
      fminus = func(x, n, prm);

      x[i]   = x0;		/* leave the caller's point as we found it */

      /* slope is (fplus - fminus) / (2*step); negated, because we return -gradient */
      dx[i] = (-0.5 * (fplus - fminus)) / step;

      ESL_DASSERT1((! isnan(dx[i])));
    }
}
示例#10
0
/* select_e()
 *
 * Stochastically choose which M(k) or D(k) state on row <i> of <ox>
 * preceded the E state. Returns the chosen state type (p7T_M or p7T_D)
 * and sets <*ret_k> to its model position.
 *
 * Using FChoose() here would mean allocating tmp space for 2M-1 paths.
 * Instead, E(i) is itself the normalization factor, so FChoose's
 * algorithm is done on the fly: every M and D cell is scaled by 1/E(i)
 * and accumulated until the running sum passes the random roll. The
 * accumulation is in double precision, to be sure 0.0 <= roll < 1.0
 * is handled properly.
 *
 * Vector <q>, lane <r> maps to model position k = r*Q + q + 1.
 * <om> is not used.
 */
static inline int
select_e(ESL_RANDOMNESS *rng, const P7_OPROFILE *om, const P7_OMX *ox, int i, int *ret_k)
{
  const int    cell[2]   = { p7X_M, p7X_D };   /* DP cells to visit per vector, in order */
  const int    sttype[2] = { p7T_M, p7T_D };   /* ...and the state type each stands for  */
  const int    Q         = p7O_NQF(ox->M);
  const double roll      = esl_random(rng);
  const double norm      = 1.0 / ox->xmx[i*p7X_NXCELLS+p7X_E];
  const __m128 xEv       = _mm_set1_ps(norm);  /* all M, D already scaled exactly the same */
  double       sum       = 0.0;
  union { __m128 v; float p[4]; } u;
  int          q, r, s;

  for (;;)
    {
      for (q = 0; q < Q; q++)
        {
          for (s = 0; s < 2; s++)
            {
              u.v = _mm_mul_ps(ox->dpf[i][q*3 + cell[s]], xEv);
              for (r = 0; r < 4; r++)
                {
                  sum += u.p[r];
                  if (roll < sum)
                    {
                      *ret_k = r*Q + q + 1;
                      return sttype[s];
                    }
                }
            }
        }
      /* A full pass without a hit: the total should have been ~1. Go around again. */
      ESL_DASSERT1((sum > 0.99));
    }
  /*UNREACHED*/
  ESL_EXCEPTION(-1, "unreached code was reached. universe collapses.");
} 
示例#11
0
/* Function:  p7_coords2_hash_Create()
 * Synopsis:  Create a <P7_COORDS2_HASH>
 *
 * Purpose:   Allocate and initialize a <P7_COORDS2_HASH> hash table
 *            for storing lots of coord2 arrays (i.e. domain
 *            annotations).
 *
 *            Each <init_*> argument sets an initial allocation size;
 *            pass 0 to get the default. Defaults are 128 hashtable
 *            slots (<init_hashsize>), room for 128 keys
 *            (<init_nkeyalloc>), and room for 2048 integers of key
 *            data (<init_calloc>).
 *
 *            The defaults should generally be fine. All three are
 *            grown automatically as needed, as keys are added.
 *
 *            "Key data" means <n> <start>/<end> pairs, plus <n>
 *            itself: it takes 2n+1 integers to store a <P7_COORD2>
 *            array of length <n>.
 *
 *            A non-default <init_hashsize> must be a power of 2.
 *
 *            The new table is empty: every hashtable slot is set
 *            to -1, and the key and key-data counts are zero.
 *
 * Args:      init_hashsize  : initial hashtable size. Power of 2, or 0 for default.
 *            init_nkeyalloc : initial allocation for # keys, or 0 for default.
 *            init_calloc    : initial allocation for key data, or 0 for default.
 *
 * Returns:   pointer to the new <P7_COORDS2_HASH> object on success.
 *
 * Throws:    <NULL> on allocation failure.
 */
P7_COORDS2_HASH *
p7_coords2_hash_Create(int32_t init_hashsize, int32_t init_nkeyalloc, int32_t init_calloc)
{
  P7_COORDS2_HASH *ch = NULL;
  int32_t          slot;
  int              status;

  /* A power of 2 has a single bit set, so x & (x-1) clears it to zero. */
  ESL_DASSERT1(( init_hashsize == 0 || (init_hashsize && ((init_hashsize & (init_hashsize-1)) == 0))));
  
  ESL_ALLOC(ch, sizeof(P7_COORDS2_HASH));

  /* NULL all owned pointers first, so cleanup on a later failure is safe. */
  ch->hashtable  = NULL;
  ch->key_offset = NULL;
  ch->nxt        = NULL;
  ch->cmem       = NULL;

  ch->nkeys      = 0;
  ch->cn         = 0;

  /* Resolve the requested sizes, substituting defaults for anything <= 0. */
  ch->hashsize = 128;
  ch->kalloc   = 128;
  ch->calloc   = 2048;
  if (init_hashsize  > 0) ch->hashsize = init_hashsize;
  if (init_nkeyalloc > 0) ch->kalloc   = init_nkeyalloc;
  if (init_calloc    > 0) ch->calloc   = init_calloc;
  
  ESL_ALLOC(ch->hashtable, sizeof(int32_t) * ch->hashsize);
  slot = 0;
  while (slot < ch->hashsize)
    {
      ch->hashtable[slot] = -1;
      slot++;
    }

  ESL_ALLOC(ch->key_offset, sizeof(int32_t) * ch->kalloc);
  ESL_ALLOC(ch->nxt,        sizeof(int32_t) * ch->kalloc);
  ESL_ALLOC(ch->cmem,       sizeof(int32_t) * ch->calloc);
  return ch;
  
 ERROR:
  p7_coords2_hash_Destroy(ch);
  return NULL;
}
示例#12
0
/* Function:  p7_anchors_Resize()
 * Synopsis:  Reallocate a P7_ANCHORS object, if necessary
 *
 * Purpose:   Make sure that <anch> can hold an array of at least <D>
 *            anchors; internally that requires <D+2> array elements.
 *
 *            Data already stored in <anch> are left alone, so it is
 *            safe to resize an anchor array that is being grown
 *            incrementally (as in the segmental divide and conquer
 *            MPAS algorithm).
 *
 *            D=0 is a valid argument and may occur in normal use; it
 *            results in a no-op, because the structure is always big
 *            enough to hold zero anchors.
 *
 *            Growth policy: if the needed size is below the redline
 *            <anch->nredline>, or <anch->D> is nonzero (suggesting
 *            incremental construction), the allocation is doubled
 *            until it suffices, to head off further reallocations.
 *            Otherwise exactly <D+2> elements are allocated.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    the status set by <ESL_REALLOC()> on reallocation failure.
 *
 * Xref:      First example of a new pattern for how we
 *            can handle reallocation/reuse strategy,
 *            replacing _Reinit() and _Grow() interfaces.
 *            [SRE:J14/1]
 */
int
p7_anchors_Resize(P7_ANCHORS *anch, int D)
{
    const int needed = D + 2;
    int       nalloc;
    int       status;

    /* Contract checks, argument validation */
    ESL_DASSERT1(( anch->nalloc > 0 ));

    // Already big enough: nothing to do.
    if (needed <= anch->nalloc) return eslOK;

    if (needed < anch->nredline || anch->D > 0)
    {
        // Under the redline, or apparently growing incrementally: double.
        for (nalloc = anch->nalloc; nalloc < needed; nalloc *= 2)
            ;
    }
    else
    {
        // Over the redline and apparently starting from empty: allocate
        // exactly. The result will probably not be a power of two, but the
        // next _Reuse() call will pull it back to the redline, which is.
        nalloc = needed;
    }

    ESL_REALLOC(anch->a, sizeof(P7_ANCHOR) * nalloc);
    anch->nalloc = nalloc;
    return eslOK;

ERROR:
    return status;
}
示例#13
0
/* Function:  p7_filtermx_GrowTo_avx()
 * Synopsis:  Resize filter DP matrix for new profile size (AVX2 build).
 *
 * Purpose:   Given an existing filter matrix structure <fx> and the
 *            size <allocM> (in consensus positions) of a profile we
 *            are about to use, make sure the AVX portion of <fx> is
 *            large enough for it. If <fx->allocM_AVX> already covers
 *            <allocM>, nothing happens. Otherwise <fx->dp_mem_AVX>
 *            is reallocated, and <fx->dp_AVX> is reset to the first
 *            suitably aligned address inside it.
 *
 *            When compiled without <HAVE_AVX2>, the function only
 *            checks its argument and returns <eslENORESULT>.
 *
 * Returns:   <eslOK> on success; <eslENORESULT> if AVX2 support was
 *            not compiled in.
 *
 * Throws:    <eslEMEM> on allocation failure. The state of
 *            <fx> is now undefined, and it should not be used.
 */
int
p7_filtermx_GrowTo_avx(P7_FILTERMX *fx, int allocM)
{
  int status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

#ifdef HAVE_AVX2
  /* Nothing to do if the existing allocation already suffices. */
  if (fx->allocM_AVX >= allocM) return eslOK;

  /* Over-allocate by (alignment-1) bytes so an aligned start always fits. */
  ESL_REALLOC(fx->dp_mem_AVX, (sizeof(__m256i) * (p7F_NSCELLS * P7_NVW_AVX(allocM))) + (p7_VALIGN_AVX-1));

  /* Round the raw pointer up to the alignment boundary. */
  fx->dp_AVX     = (__m256i *) ( (unsigned long int) ( (char *) fx->dp_mem_AVX + (p7_VALIGN_AVX-1)) & p7_VALIMASK_AVX);
  fx->allocM_AVX = allocM;

  return eslOK;

 ERROR:
  return status;
#else
  return eslENORESULT;
#endif /* HAVE_AVX2 */
}
示例#14
0
/* Function:  p7_filtermx_GrowTo_neon64()
 * Synopsis:  Resize filter DP matrix for new profile size (NEON64 build).
 *
 * Purpose:   Given an existing filter matrix structure <fx> and the
 *            size <allocM> (in consensus positions) of a profile we
 *            are about to use, make sure <fx> is large enough for
 *            it. If <fx->allocM> already covers <allocM>, nothing
 *            happens. Otherwise <fx->dp_mem> is reallocated, and
 *            <fx->dp> is reset to the first suitably aligned address
 *            inside it.
 *
 *            When compiled without <HAVE_NEON64>, the function does
 *            nothing and returns <eslENORESULT>.
 *
 * Returns:   <eslOK> on success; <eslENORESULT> if NEON64 support
 *            was not compiled in.
 *
 * Throws:    <eslEMEM> on allocation failure. The state of
 *            <fx> is now undefined, and it should not be used.
 */
int
p7_filtermx_GrowTo_neon64(P7_FILTERMX *fx, int allocM)
{
#ifdef HAVE_NEON64  
  int status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

  /* Nothing to do if the existing allocation already suffices. */
  if (fx->allocM >= allocM) return eslOK;

  /* Over-allocate by (alignment-1) bytes so an aligned start always fits. */
  ESL_REALLOC(fx->dp_mem, (sizeof(esl_neon_128i_t) * (p7F_NSCELLS * P7_NVW(allocM))) + (p7_VALIGN-1));

  /* Round the raw pointer up to the alignment boundary. */
  fx->dp     = (esl_neon_128i_t *) ( (unsigned long int) ( (char *) fx->dp_mem + (p7_VALIGN-1)) & p7_VALIMASK);
  fx->allocM = allocM;

  return eslOK;

 ERROR:
  return status;
#else
  return eslENORESULT;
#endif /* HAVE_NEON64 */
}
示例#15
0
/* a2m_padding_text()
 *
 * Expand every text-mode sequence in <msa> to a common aligned length.
 *
 * The final alignment has <ncons> consensus columns; before consensus
 * column <cpos> (and after the last one, for cpos == ncons) there are
 * <nins[cpos]> insert columns. For sequence <idx>, characters whose
 * <csflag[idx][spos]> is FALSE are copied into the insert columns and
 * the remainder of each insert region is filled with '.'; the next
 * character is then copied into the consensus column.
 *
 * Also allocates <msa->rf>, writing '.' for insert columns and 'x' for
 * consensus columns, and sets <msa->alen>. Each <msa->aseq[idx]> is
 * freed and replaced by the newly built string.
 *
 * NOTE(review): the insert-copy loop stops only on a non-FALSE flag, so
 * this appears to assume each <csflag[idx]> row carries a non-FALSE
 * entry past the last residue -- confirm against the caller.
 *
 * Returns <eslOK> on success; on allocation failure, the status set by
 * <ESL_ALLOC()>.
 */
static int
a2m_padding_text(ESL_MSA *msa, char **csflag, int *nins, int ncons)
{
  char   *newseq = NULL;	/* sequence being built; swapped into msa->aseq[] when complete */
  int     apos;			/* next aligned column to write      */
  int     spos;			/* next source character to consume  */
  int     cpos;			/* consensus column index, 0..ncons  */
  int     alen;
  int     nfilled;		/* insert columns filled in current region */
  int     idx;
  int     status;

  /* Total width = consensus columns + all insert regions. */
  alen = ncons;
  for (cpos = 0; cpos <= ncons; cpos++) alen += nins[cpos];

  /* Reference annotation line. */
  ESL_ALLOC(msa->rf, sizeof(char) * (alen+1));
  apos = 0;
  for (cpos = 0; cpos <= ncons; cpos++)
    {
      for (nfilled = 0; nfilled < nins[cpos]; nfilled++) msa->rf[apos++] = '.';
      if  (cpos < ncons)                                 msa->rf[apos++] = 'x';
    }
  msa->rf[apos] = '\0';
  
  for (idx = 0; idx < msa->nseq; idx++)
    {
      ESL_ALLOC(newseq, sizeof(char) * (alen + 1));    
      apos = 0;
      spos = 0;
      for (cpos = 0; cpos <= ncons; cpos++)
        {
          /* characters flagged as non-consensus go into the insert region... */
          for (nfilled = 0; csflag[idx][spos] == FALSE; nfilled++)
            newseq[apos++] = msa->aseq[idx][spos++];

          /* ...which is then padded out with '.'... */
          for (; nfilled < nins[cpos]; nfilled++)
            newseq[apos++] = '.';

          /* ...followed by the consensus character, except after the last region. */
          if (cpos < ncons)
            newseq[apos++] = msa->aseq[idx][spos++];
        }
      ESL_DASSERT1( (msa->aseq[idx][spos] == '\0') );
      ESL_DASSERT1( (apos == alen) );
      newseq[alen] = '\0';

      free(msa->aseq[idx]);
      msa->aseq[idx] = newseq;
      newseq         = NULL;
    }
  msa->alen = alen;
  return eslOK;
  
 ERROR:
  if (newseq) free(newseq);
  return status;
}
示例#16
0
/* Function:  p7_filtermx_DumpMFRow_neon64()
 * Synopsis:  Dump one row from MSV version of a DP matrix (NEON64 build).
 *
 * Purpose:   Print the current row of MSV calculations held in DP
 *            matrix <fx> to <fx->dfp> for diagnostics, together with
 *            the special-state values <xE>, <xN>, <xJ>, <xB>, <xC>.
 *            <rowi> is used as the row label. This routine is
 *            specialized for the MSVFilter() row layout, which is
 *            all match scores dp[0..q..Q-1] rather than triplets of
 *            M,D,I.
 *
 *            If <rowi> is 0, a header is printed first.
 *
 *            The output format is coordinated with <p7_refmx_Dump()>
 *            to facilitate comparison to a known answer; to that end
 *            all-zero I and D rows are printed too.
 *
 *            This also works for an SSV filter row, for SSV
 *            implementations that use a single row of DP memory
 *            (like <_longtarget>). The Knudsen assembly code SSV
 *            does not use any RAM.
 *
 *            When compiled without <HAVE_NEON64>, the function does
 *            nothing and returns <eslENORESULT>.
 *
 * Returns:   <eslOK> on success; <eslENORESULT> if NEON64 support
 *            was not compiled in.
 *
 * Throws:    <eslEMEM> on allocation failure.
 */
int
p7_filtermx_DumpMFRow_neon64(const P7_FILTERMX *fx, int rowi, uint8_t xE, uint8_t xN, uint8_t xJ, uint8_t xB, uint8_t xC)
{
#ifdef HAVE_NEON64 
  union { esl_neon_128i_t v; uint8_t i[16]; } lanes;   /* for reading one vector's bytes */
  const int  Q      = P7_NVB(fx->M);	/* number of vectors in the MSV row       */
  uint8_t   *scores = NULL;		/* row in normal (unstriped) order, 0..M  */
  int        q, z, k;
  int        status;

  ESL_DASSERT1( (fx->type == p7F_MSVFILTER || fx->type == p7F_SSVFILTER) );

  /* Room for every lane of every vector, plus the k=0 column. */
  ESL_ALLOC(scores, sizeof(unsigned char) * ((Q*16)+1));
  scores[0] = 0;

  /* Column header, on the first row only. */
  if (rowi == 0)
    {
      fprintf(fx->dfp, "       ");
      for (k = 0; k <= fx->M;  k++) fprintf(fx->dfp, "%3d ", k);
      fprintf(fx->dfp, "%3s %3s %3s %3s %3s\n", "E", "N", "J", "B", "C");

      fprintf(fx->dfp, "       ");
      for (k = 0; k <= fx->M+5;  k++) fprintf(fx->dfp, "%3s ", "---");
      fprintf(fx->dfp, "\n");
    }

  /* Unstripe: lane z of vector q holds model position z*Q + q + 1. */
  for (q = 0; q < Q; q++)
    {
      lanes.v = fx->dp[q];
      for (z = 0; z < 16; z++)
        scores[z*Q + q + 1] = lanes.i[z];
    }

  /* M row, followed by the specials on the same line. */
  fprintf(fx->dfp, "%4d M ", rowi);
  for (k = 0; k <= fx->M; k++) fprintf(fx->dfp, "%3d ", scores[k]);
  fprintf(fx->dfp, "%3d %3d %3d %3d %3d\n", xE, xN, xJ, xB, xC);

  /* I row: all zeros, printed only so the layout matches refmx. */
  fprintf(fx->dfp, "%4d I ", rowi);
  for (k = 0; k <= fx->M; k++) fprintf(fx->dfp, "%3d ", 0);
  fprintf(fx->dfp, "\n");

  /* D row: likewise all zeros. */
  fprintf(fx->dfp, "%4d D ", rowi);
  for (k = 0; k <= fx->M; k++) fprintf(fx->dfp, "%3d ", 0);
  fprintf(fx->dfp, "\n\n");

  free(scores);
  return eslOK;

ERROR:
  free(scores);
  return status;
#else
  return eslENORESULT;
#endif /* HAVE_NEON64 */
}
示例#17
0
/* ILogsumNI()
 *
 * Table-driven integer log-sum of <s1> and <s2>: returns the larger
 * score plus the correction ilogsum_lookup[difference], or just the
 * larger score when the two differ by LOGSUM_TBL or more.
 *
 * Debug builds assert that both scores are strictly greater than -INFTY.
 */
int 
ILogsumNI(int s1, int s2)
{
  ESL_DASSERT1((s1 > -INFTY));
  ESL_DASSERT1((s2 > -INFTY));

  int larger;
  int gap;     /* larger minus smaller; never negative */

  if (s1 > s2) { larger = s1; gap = s1 - s2; }
  else         { larger = s2; gap = s2 - s1; }

  if (gap >= LOGSUM_TBL) return larger;
  return larger + ilogsum_lookup[gap];
} 
示例#18
0
/* ILogsumNI_diff()
 *
 * Table-driven integer log-sum of the two scores (s1a + s1b) and
 * (s2a + s2b), for callers that already know the difference
 * db = s1b - s2b. The difference between the two totals is then
 * d = s1a - s2a + db.
 *
 * Returns the larger total plus the correction ilogsum_lookup[|d|],
 * or just the larger total when |d| >= LOGSUM_TBL.
 *
 * Debug builds assert that all four score terms are strictly greater
 * than -INFTY.
 */
int 
ILogsumNI_diff(int s1a, int s1b, int s2a, int s2b, int db)
{
  ESL_DASSERT1((s1a > -INFTY));
  ESL_DASSERT1((s1b > -INFTY));
  ESL_DASSERT1((s2a > -INFTY));
  ESL_DASSERT1((s2b > -INFTY));

  const int d = s1a-s2a+db;

  if (d > 0) 
    return  (d >=  LOGSUM_TBL) ? s1a + s1b : s1a + s1b + ilogsum_lookup[d];
  else
    /* d <= 0 here, so the cutoff must be compared against -LOGSUM_TBL;
     * comparing against +LOGSUM_TBL would be true for every d in this
     * branch and the table correction would never be applied.
     */
    return  (d <= -LOGSUM_TBL) ? s2a + s2b : s2a + s2b + ilogsum_lookup[-d];
} 
示例#19
0
/* Function:  esl_msaweight_PB()
* Synopsis:  PB (position-based) weights.
* Incept:    SRE, Sun Nov  5 08:59:28 2006 [Janelia]
*
* Purpose:   Given a multiple alignment <msa>, calculate sequence
*            weights according to the position-based weighting
*            algorithm (Henikoff and Henikoff, JMB 243:574-578,
*            1994). These weights are stored internally in the <msa>
*            object, replacing any weights that may have already been
*            there. Weights are $\geq 0$ and they sum to <msa->nseq>.
*            
*            The <msa> may be in either digitized or text mode.
*            Digital mode is preferred, so that the algorithm
*            deals with degenerate residue symbols properly.
*            
*            The Henikoffs' algorithm does not give rules for dealing
*            with gaps or degenerate residue symbols. The rule here
*            is to ignore them. This means that longer sequences
*            initially get more weight; hence a "double
*            normalization" in which the weights are first divided by
*            sequence length in canonical residues (to compensate for
*            that effect), then normalized to sum to nseq.
*            
*            An advantage of the PB method is efficiency.
*            It is $O(1)$ in memory and $O(NL)$ time, for an alignment of
*            N sequences and L columns. This makes it a good method 
*            for ad hoc weighting of very deep alignments.
*            
*            When the alignment is in simple text mode, IUPAC
*            degenerate symbols are not dealt with correctly; instead,
*            the algorithm simply uses the 26 letters as "residues"
*            (case-insensitively), and treats all other residues as
*            gaps.
*
* Returns:   <eslOK> on success, and the weights inside <msa> have been
*            modified. 
*
* Throws:    <eslEMEM> on allocation error, in which case <msa> is
*            returned unmodified.
*
* Xref:      [Henikoff94b]; squid::weight.c::PositionBasedWeights().
*/
int
esl_msaweight_PB(ESL_MSA *msa)
{
    int    *nres = NULL;   	/* counts of each residue observed in a column */
    int     ntotal;		/* number of different symbols observed in a column */
    int     rlen;			/* number of residues in a sequence */
    int     idx, pos, i;
    int     K;			/* alphabet size */
    int     status;

    /* Contract checks
    */
    ESL_DASSERT1( (msa->nseq >= 1) );
    ESL_DASSERT1( (msa->alen >= 1) );
    if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

    /* Initialize
    */
    if (! (msa->flags & eslMSA_DIGITAL)) 
    { ESL_ALLOC_WITH_TYPE(nres, int*, sizeof(int) * 26);          K = 26;          }
示例#20
0
/* Function:  esl_sxp_cdf()
 *
 * Purpose:   Calculates the cumulative distribution function for the 
 *            stretched exponential pdf, $P(X \leq x)$, given
 *            quantile <x>, offset <mu>, and parameters <lambda> and <tau>.
 *
 *            For <x> at or below <mu> the result is 0. Otherwise it
 *            is the value that <esl_stats_IncompleteGamma()> returns
 *            in its first output argument for shape $1/\tau$ and
 *            argument $(\lambda (x - \mu))^{\tau}$.
 */
double
esl_sxp_cdf(double x, double mu, double lambda, double tau)
{
  double y;        /* scaled distance above the offset */
  double cdf;

  if (x <= mu) return 0.;

  y = lambda * (x-mu);
  esl_stats_IncompleteGamma(1/tau, exp(tau * log(y)), &cdf, NULL);   /* exp(tau*log(y)) is y^tau */
  
  ESL_DASSERT1 (( !isnan(cdf)));
  return cdf;
}
/* jukescantor()
 * 
 * The generalized Jukes/Cantor distance calculation.
 * Given <n1> identities and <n2> differences, for a
 * base alphabet size of <alphabet_size> (4 or 20),
 * calculate the J/C distance in substitutions/site and
 * the large-sample variance. Each is stored through
 * <opt_distance> / <opt_variance> if that pointer is
 * non-NULL.
 *
 * When the observed fraction of differences is too large
 * for the correction to be defined (the log argument is
 * <= 0), both results are <HUGE_VAL> and the return is
 * still <eslOK>.
 *
 * Returns <eslEDIVZERO> if there are no data (<n1+n2=0>);
 * both outputs are then set to <HUGE_VAL>.
 */
static int
jukescantor(int n1, int n2, int alphabet_size, double *opt_distance, double *opt_variance)
{
  double K;                       /* alphabet size            */
  double D;                       /* fraction of differences  */
  double N;                       /* number of compared sites */
  double x;                       /* argument of the log      */
  double distance = HUGE_VAL;     /* pessimistic defaults, overwritten when computable */
  double variance = HUGE_VAL;
  int    status   = eslOK;

  ESL_DASSERT1( (n1 >= 0) );
  ESL_DASSERT1( (n2 >= 0) );
  ESL_DASSERT1( (alphabet_size >= 0) );

  if (n1+n2 == 0)
    {
      status = eslEDIVZERO;
    }
  else
    {
      K = (double) alphabet_size;
      D = (double) n2 / (double) (n1+n2);
      N = (double) (n1+n2);

      x = 1. - D * K/(K-1.);
      if (! (x <= 0.))
        {
          distance =   -log(x) * K/(K-1);
          variance =  exp( 2.*K*distance/(K-1) ) * D * (1.-D) / N;
        }
    }

  if (opt_distance != NULL)  *opt_distance = distance;
  if (opt_variance != NULL)  *opt_variance = variance;
  return status;
}
示例#22
0
/* Function:  p7_masstrace_FinishCount()
 * Synopsis:  Turn collected endpoint counts into cumulative distributions.
 *
 * Purpose:   Endpoint counts have been accumulated in <mt> (with
 *            <_CountTrace()>) from <ntr> traces that contained the
 *            domain anchor. Convert those counts, in place, into
 *            <mt>'s cumulative probability distributions.
 *
 *            For each of the two arrays, counts left of the anchor
 *            are summed left-to-right, counts right of the anchor
 *            are summed right-to-left, positions 1..n are scaled by
 *            1/<ntr>, and the anchor position itself is set to 1.
 *            The sequence-coordinate array <mt->imass> is optional
 *            and is skipped when it is NULL.
 *
 * Args:      mt  - mass trace object holding the endpoint counts
 *            ntr - number of counted traces that contained the domain anchor
 *
 * Returns:   <eslOK> on success; <mt> now holds envelope endpoint
 *            cumulative probability distributions.
 */
int
p7_masstrace_FinishCount(P7_MASSTRACE *mt, int ntr)
{
  int pos;

  ESL_DASSERT1( (ntr > 0) );
  ESL_DASSERT1( (mt->i0)  );
  ESL_DASSERT1( (mt->k0)  );
  ESL_DASSERT1( (p7_trace_IsMain(mt->st0)) );

  /* Sequence coordinates (optional array). */
  if (mt->imass)
    {
      pos = 1;
      while (pos < mt->i0) { mt->imass[pos] += mt->imass[pos-1]; pos++; }

      /* NOTE(review): the first step reads imass[L+1]; assumes that slot exists -- confirm. */
      pos = mt->L;
      while (pos > mt->i0) { mt->imass[pos] += mt->imass[pos+1]; pos--; }

      esl_vec_FScale(mt->imass+1, mt->L, 1./(float) ntr);
      mt->imass[mt->i0] = 1.;
    }

  /* Model coordinates (always present). */
  pos = 1;
  while (pos < mt->k0) { mt->kmass[pos] += mt->kmass[pos-1]; pos++; }

  /* NOTE(review): the first step reads kmass[M+1]; assumes that slot exists -- confirm. */
  pos = mt->M;
  while (pos > mt->k0) { mt->kmass[pos] += mt->kmass[pos+1]; pos--; }

  esl_vec_FScale(mt->kmass+1, mt->M, 1./(float) ntr);
  mt->kmass[mt->k0] = 1.;

  return eslOK;
}
示例#23
0
/* Function:  esl_msaweight_BLOSUM()
 * Synopsis:  BLOSUM weights.
 * Incept:    SRE, Sun Nov  5 09:52:41 2006 [Janelia]
 *
 * Purpose:   Calculate sequence weights for the multiple alignment
 *            <msa> with the BLOSUM algorithm (Henikoff and Henikoff,
 *            PNAS 89:10915-10919, 1992), using the fractional
 *            pairwise identity threshold <maxid> (range $0..1$).
 *            The weights are stored in <msa>, replacing whatever
 *            weights were there. Weights are $\geq 0$ and sum to
 *            <msa->nseq>.
 *
 *            Sequences are grouped by single linkage clustering on
 *            fractional identity, so that no two clusters share a
 *            pairwise link $\geq$ <maxid>. Each of the $M_i$
 *            sequences in cluster $i$ gets weight $\frac{1}{M_i}$,
 *            and the weights are then renormalized to sum to
 *            <msa->nseq>.
 *
 *            <msa> may be in digital or text mode. Digital mode is
 *            preferred, because the pairwise identity calculation
 *            then treats degenerate residue symbols properly.
 *
 * Returns:   <eslOK> on success, and the weights inside <msa> have
 *            been modified.
 *
 * Throws:    <eslEMEM> on allocation error. <eslEINVAL> if a pairwise
 *            identity calculation fails because of corrupted sequence
 *            data. In either case, the <msa> is unmodified.
 *
 * Xref:      [Henikoff92]; squid::weight.c::BlosumWeights().
 */
int
esl_msaweight_BLOSUM(ESL_MSA *msa, double maxid)
{
  int  *cluster = NULL;   /* cluster index assigned to each sequence */
  int  *csize   = NULL;   /* number of member sequences per cluster  */
  int   nclust;           /* number of clusters                      */
  int   idx;              /* sequence index                          */
  int   status;

  /* Contract checks */
  ESL_DASSERT1( (maxid >= 0. && maxid <= 1.) );
  ESL_DASSERT1( (msa->nseq >= 1) );
  ESL_DASSERT1( (msa->alen >= 1) );

  /* A lone sequence trivially gets weight 1. */
  if (msa->nseq == 1) { msa->wgt[0] = 1.0; return eslOK; }

  status = esl_msacluster_SingleLinkage(msa, maxid, &cluster, NULL, &nclust);
  if (status != eslOK) goto ERROR;

  /* Tally cluster sizes; each sequence is weighted 1/(its cluster's size). */
  ESL_ALLOC(csize, sizeof(int) * nclust);
  esl_vec_ISet(csize, nclust, 0);
  for (idx = 0; idx < msa->nseq; idx++)
    csize[cluster[idx]]++;
  for (idx = 0; idx < msa->nseq; idx++)
    msa->wgt[idx] = 1. / (double) csize[cluster[idx]];

  /* Rescale so the weights sum to nseq. */
  esl_vec_DNorm(msa->wgt, msa->nseq);
  esl_vec_DScale(msa->wgt, msa->nseq, (double) msa->nseq);
  msa->flags |= eslMSA_HASWGTS;

  status = eslOK;
  /* Success falls through into the shared cleanup below. */

 ERROR:
  free(csize);
  free(cluster);
  return status;
}
示例#24
0
static int 
do_by_windows(ESL_GENCODE *gcode, ESL_GENCODE_WORKSTATE *wrk, ESL_SQFILE *sqfp)
{
  ESL_SQ *sq = esl_sq_CreateDigital(gcode->nt_abc);
  int     windowsize  = 4092;                // can be any value, but a multiple of 3 makes most sense. windowsize can be +/-; + means reading top strand; - means bottom strand.
  int     contextsize = 2;                   // contextsize (adjacent window overlap) must be 2, or translation won't work properly.
  int     wstatus;

  ESL_DASSERT1(( windowsize  % 3 == 0 ));  

  while (( wstatus = esl_sqio_ReadWindow(sqfp, contextsize, windowsize, sq)) != eslEOF)
    {
      if (wstatus == eslEOD)
	{
	  if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick))
	    esl_gencode_ProcessEnd(wrk, sq);

	  if (windowsize > 0 && ! wrk->do_crick) { esl_sq_Reuse(sq); continue; } // Don't switch to revcomp if we don't need do. Allows -W --watson to work on nonrewindable streams
	  if (windowsize < 0) esl_sq_Reuse(sq);             // Do not Reuse the sq on the switch from watson to crick; ReadWindow needs sq->L
	  windowsize = -windowsize;                         // switch to other strand.
	  continue;
	}
      else if (wstatus == eslEFORMAT) esl_fatal("Parsing failed in sequence file %s:\n%s",          sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (wstatus == eslEINVAL)  esl_fatal("Invalid residue(s) found in sequence file %s\n%s", sqfp->filename, esl_sqfile_GetErrorBuf(sqfp));
      else if (wstatus != eslOK)      esl_fatal("Unexpected error %d reading sequence file %s", wstatus, sqfp->filename);

      /* If we're the first window in this input DNA sequence 
       * (or the first window in its revcomp), then initialize.
       * sq->C is the actual context overlap; 0=1st window; 2 (i.e. C)= subsequent.
       */
      if (sq->C == 0) 
	{
	  if (sq->n < 3) continue; // DNA sequence too short; skip it, don't even bother to revcomp, go to next sequence.
	  if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick))
	    esl_gencode_ProcessStart(gcode, wrk, sq);
	}

      if ( (windowsize > 0 && wrk->do_watson) || (windowsize < 0 && wrk->do_crick))      
	esl_gencode_ProcessPiece(gcode, wrk, sq);
    }
  esl_sq_Destroy(sq);
  return eslOK;
}
示例#25
0
/* Function:  p7_filtermx_Create_neon64()
 * Synopsis:  Create a one-row DP matrix for MSV, VF (NEON64 build).
 *
 * Purpose:   Allocate a reusable, resizeable one-row <P7_FILTERMX>
 *            suitable for MSV and Viterbi filter calculations on
 *            query profiles of up to <allocM> consensus positions.
 *
 *            <allocM> must be $\leq$ 100,000. This is an H3 design
 *            limit.
 *
 * Args:      allocM - initial allocation size, in profile positions. (>=1, <=100000)
 *
 * Returns:   ptr to new <P7_FILTERMX>. When the code is compiled
 *            without <HAVE_NEON64>, always returns <NULL>.
 *
 * Throws:    <NULL> on allocation failure.
 */
P7_FILTERMX *
p7_filtermx_Create_neon64(int allocM)
{
#ifdef HAVE_NEON64
  P7_FILTERMX *fx = NULL;
  char        *padded;
  int          status;

  /* Contract checks / argument validation */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

  /* Shell first. Every field gets a safe value before the row memory is
   * requested, so the ERROR path can hand a half-built object to
   * p7_filtermx_Destroy().
   */
  ESL_ALLOC(fx, sizeof(P7_FILTERMX));
  fx->simd      = NEON64;     /* ISA we're using */
  fx->type      = p7F_NONE;
  fx->M         = 0;
  fx->allocM    = 0;
  fx->dp        = NULL;
  fx->dp_mem    = NULL;
#ifdef p7_DEBUGGING
  fx->do_dumping= FALSE;
  fx->dfp       = NULL;
#endif 

  /* Row memory: one vector per state cell per vector-width chunk of the
   * profile, plus (p7_VALIGN-1) bytes of slop for manual alignment.
   */
  ESL_ALLOC(fx->dp_mem, (sizeof(esl_neon_128i_t) * p7F_NSCELLS * P7_NVW(allocM)) + (p7_VALIGN-1));
  fx->allocM = allocM;

  /* Manual alignment: step past the slop, then mask the address
   * with p7_VALIMASK.
   */
  padded = (char *) fx->dp_mem + (p7_VALIGN-1);
  fx->dp = (esl_neon_128i_t *) ( (unsigned long int) padded & p7_VALIMASK);

  return fx;

 ERROR:
  p7_filtermx_Destroy(fx);
  return NULL;
#else  /* no NEON64 support compiled in */
  return NULL;
#endif /* HAVE_NEON64 */
}
示例#26
0
/* Function:  p7_filtermx_Create_avx()
 * Synopsis:  Create a one-row DP matrix for MSV, VF (AVX2 build).
 *
 * Purpose:   Allocate a reusable, resizeable one-row <P7_FILTERMX>
 *            suitable for MSV and Viterbi filter calculations on
 *            query profiles of up to <allocM> consensus positions.
 *
 *            <allocM> must be $\leq$ 100,000. This is an H3 design
 *            limit.
 *
 * Args:      allocM - initial allocation size, in profile positions. (>=1, <=100000)
 *
 * Returns:   ptr to new <P7_FILTERMX>. When the code is compiled
 *            without <HAVE_AVX2>, always returns <NULL>.
 *
 * Throws:    <NULL> on allocation failure.
 */
P7_FILTERMX *
p7_filtermx_Create_avx(int allocM)
{
  P7_FILTERMX *fx = NULL;
  int          status;

  /* Contract checks / argument validation (done in every build) */
  ESL_DASSERT1( (allocM >= 1 && allocM <= 100000) );

#ifdef HAVE_AVX2
  /* Shell first. The AVX fields get safe values before the row memory
   * is requested.
   */
  ESL_ALLOC(fx, sizeof(P7_FILTERMX));
  fx->simd       = AVX;
  fx->type       = p7F_NONE;
  fx->allocM_AVX = 0;
  fx->dp_AVX     = NULL;
  fx->dp_mem_AVX = NULL;
#ifdef p7_DEBUGGING
  fx->do_dumping = FALSE;
  fx->dfp        = NULL;
#endif 

  /* Row memory: one 256-bit vector per state cell per vector-width chunk
   * of the profile, plus (p7_VALIGN_AVX-1) bytes of slop for manual alignment.
   */
  ESL_ALLOC(fx->dp_mem_AVX, (sizeof(__m256i) * p7F_NSCELLS * P7_NVW_AVX(allocM)) + (p7_VALIGN_AVX-1));
  fx->allocM_AVX = allocM;

  /* Manual alignment: step past the slop, then mask the address
   * with p7_VALIMASK_AVX.
   */
  fx->dp_AVX = (__m256i *) ( (unsigned long int) (  (char *) fx->dp_mem_AVX + (p7_VALIGN_AVX-1) ) & p7_VALIMASK_AVX);

  return fx;

 ERROR:
  p7_filtermx_Destroy(fx);
  return NULL;
#else  /* no AVX2 support compiled in */
  return NULL;
#endif /* HAVE_AVX2 */
}
示例#27
0
/* wei_binned_func():
 * Objective function for fitting to a binned data sample: returns
 * the negative log likelihood of the histogram counts, using the
 * callback signature of the conjugate gradient descent optimizer in
 * esl_minimizer.
 *
 * <p[0]> and <p[1]> hold the logs of lambda and tau. <dptr> points
 * to a <struct wei_binned_data> carrying the histogram and the fixed
 * offset mu. <nparam> is not used here.
 *
 * Returns <eslINFINITY> if any occupied bin has probability mass
 * that is not positive.
 */
static double
wei_binned_func(double *p, int nparam, void *dptr)
{
  struct wei_binned_data *data   = (struct wei_binned_data *) dptr;
  ESL_HISTOGRAM          *h      = data->h;
  double                  lambda = exp(p[0]);   /* optimizer works on the log of each parameter */
  double                  tau    = exp(p[1]);
  double                  logL   = 0.;
  double                  lo, hi;               /* bounds of the current bin                 */
  double                  cdf_lo, cdf_hi;
  double                  mass;                 /* probability mass falling in the bin       */
  int                     b;

  for (b = h->cmin; b <= h->imax; b++)
    {
      /* Empty bins contribute nothing to the likelihood. */
      if (h->obs[b] == 0) continue;

      lo = esl_histogram_Bin2LBound(h,b);
      hi = esl_histogram_Bin2UBound(h,b);
      if (lo < data->mu) lo = data->mu;   /* clamp the lower bound to the offset mu */

      cdf_hi = esl_wei_cdf(hi, data->mu, lambda, tau);
      cdf_lo = esl_wei_cdf(lo, data->mu, lambda, tau);
      mass   = cdf_hi - cdf_lo;

      /* When the cdf is ~1.0, roundoff can push the mass below zero by
       * a tiny amount. Tolerate that, but catch anything worse.
       */
      ESL_DASSERT1( (mass + 1e-7 > 0.)); 
      if (mass <= 0.) return eslINFINITY;

      logL += h->obs[b] * log(mass);
    }

  return -logL;   /* the optimizer minimizes, so hand back the NLL */
}
示例#28
0
/* Function:  esl_mixgev_FitComplete()
 *
 * Purpose:   Given <n> observed data values <x[0..n-1]> and an
 *            initial guess <mg> at a mixture GEV fit to them, run
 *            conjugate gradient descent to obtain a locally optimal
 *            maximum likelihood mixture GEV parameter fit.
 *
 *            A reasonable initial guess for <mg> can be obtained
 *            from <esl_mixgev_FitGuess()>.
 *
 * Args:      x   - observed data, <x[0..n-1]>.
 *            n   - number of samples in <x>
 *            mg  - mixture GEV to estimate, w/ params set to
 *                  an initial guess.
 *
 * Returns:   <eslOK> on success, and <mg> contains the local ML
 *            estimate of the mixture GEV parameters.
 *
 * Throws:    <eslEMEM> on allocation error, and <mg> is unchanged
 *            from its initial state. A non-OK status from the
 *            optimizer is passed back as-is, and in that case the
 *            parameter vector is not unpacked into <mg>.
 */
int
esl_mixgev_FitComplete(double *x, int n, ESL_MIXGEV *mg)
{
  struct mixgev_data data;
  double *p   = NULL;    /* packed parameter vector            */
  double *u   = NULL;    /* step size for each parameter       */
  double *wrk = NULL;    /* optimizer workspace, 4*np doubles  */
  double  tol = 1e-6;
  double  fx;
  int     np;
  int     k;
  int     i;
  int     status;

  /* Count free parameters: K-1 mixture coefficients, then two per
   * Gumbel component and three per any other component.
   */
  np = mg->K-1;
  for (k = 0; k < mg->K; k++)
    {
      if (mg->isgumbel[k]) np += 2;
      else                 np += 3;
    }

  ESL_ALLOC(p,   sizeof(double) * np);
  ESL_ALLOC(u,   sizeof(double) * np);
  ESL_ALLOC(wrk, sizeof(double) * np * 4);

  /* Bundle what the objective function needs. */
  data.x   = x;
  data.n   = n;
  data.wrk = wrk;
  data.mg  = mg;

  /* Initial guess -> parameter vector. */
  mixgev_pack_paramvector(p, np, mg);

  /* Step sizes, laid out like the count above: the K-1 mixture
   * coefficients first, then each component's block, whose third
   * entry (non-Gumbel components only) takes a smaller step.
   */
  i = 0;
  while (i < mg->K-1) u[i++] = 1.0;
  for (k = 0; k < mg->K; k++)
    {
      u[i]   = 1.0;
      u[i+1] = 1.0;
      i += 2;
      if (! mg->isgumbel[k]) { u[i] = 0.02; i++; }
    }
  ESL_DASSERT1( (np == i) );

  /* Optimize. */
  status = esl_min_ConjugateGradientDescent(p, u, np, &mixgev_complete_func, NULL,
                                            (void *) (&data), tol, wrk, &fx);
  if (status != eslOK) goto ERROR;

  /* Optimized parameter vector -> mixture GEV. */
  mixgev_unpack_paramvector(p, np, mg);

  /* status is eslOK here; success shares the cleanup below. */

 ERROR:
  free(p);
  free(u);
  free(wrk);
  return status;
}
示例#29
0
/* Function:  esl_keyhash_CreateCustom()
 * Synopsis:  Allocate a new keyhash with customized initial allocations.
 *
 * Purpose:   Create a new hash table whose initial allocation is a
 *            hash table of <hashsize> entries, room for <kalloc>
 *            keys, and a total key string length of <salloc>.
 *            <hashsize> must be a power of 2, and all allocations
 *            must be $\geq 0$.
 *
 *            The object still grows on demand. A custom allocation
 *            is for when you want to minimize memory footprint and
 *            expect the keyhash to stay smaller than the default
 *            (of up to 128 keys, of total length up to 2048).
 *
 * Returns:   ptr to the new keyhash.
 *
 * Throws:    <NULL> on allocation failure.
 */
ESL_KEYHASH *
esl_keyhash_CreateCustom(uint32_t hashsize, int kalloc, int salloc)
{
  ESL_KEYHASH *kh;

  /* A power of 2 is nonzero and shares no bits with its predecessor. */
  ESL_DASSERT1(( hashsize != 0 && (hashsize & (hashsize-1)) == 0 ));

  kh = keyhash_create(hashsize, kalloc, salloc);
  return kh;
}
/* profillic_esl_msafile_profile_Read()
 *
 * Load a galosh profile from the file named by <afp->bf->filename>
 * into <*profile_ptr>, and build a one-sequence MSA holding the
 * profile's consensus sequence (the maximum-value Match emission at
 * each position). The MSA is digital if <afp->abc> is set, text
 * otherwise; it is named "Galosh Profile" and its only sequence is
 * named "Galosh Profile Consensus".
 *
 * Returns <eslOK> on success. The new MSA is handed to the caller in
 * <*ret_msa>, or destroyed if <ret_msa> is NULL.
 *
 * On any failure after the profile has been read, the partially
 * built MSA is destroyed, <*ret_msa> (if provided) is set to NULL,
 * and the failing status is returned: <eslEMEM> if the MSA cannot be
 * created, <eslEFORMAT> if the consensus contains characters that
 * are invalid for the alphabet, or whatever status the failing Easel
 * call returned. A NULL <profile_ptr> raises <eslEINCONCEIVABLE>.
 */
static int
profillic_esl_msafile_profile_Read(ESLX_MSAFILE *afp, ESL_MSA **ret_msa, ProfileType * profile_ptr )
{
  /// \note Right now this isn't actually using the open file pointer; for convenience I just use the profile.fromFile( <filename> ) method.
  /// \todo Use convenience fns in esl_buffer.h; see eg hmmer-3.1/easel/esl_msafile_stockholm.c for examples...
  ESL_MSA                 *msa      = NULL;
  int                      seqidx;
  int                      status;
  char       errmsg2[eslERRBUFSIZE];

  ESL_DASSERT1((afp->format == eslMSAFILE_PROFILLIC));

  // Every object with an initializer or constructor is declared here, ahead
  // of the first 'goto ERROR', so that no jump bypasses an initialization.
  const char * const seqname = "Galosh Profile Consensus";
  const char * const msaname = "Galosh Profile";
  uint32_t profile_length;
  galosh::Sequence<typename ProfileType::ProfileResidueType> consensus_sequence;
  stringstream tmp_consensus_output_stream;
  string consensus_text;

  uint32_t pos_i;

  if (profile_ptr == NULL)  { ESL_EXCEPTION(eslEINCONCEIVABLE, "profile_ptr is NULL in profillic_esl_msafile_profile_Read(..)!"); }
  afp->errmsg[0] = '\0';

  // Read in the galosh profile (from profillic), by filename.
  /// \todo Reading through the already-open afp->bf->fp (and seeking it to the end afterwards, to signal that there are no more profiles in the file) did not work.  See HACKs in profillic-hmmbuild.cpp to work around it.
  profile_ptr->fromFile( afp->bf->filename );

  // Calculate the consensus sequence.
  profile_length = profile_ptr->length();
  consensus_sequence.reinitialize( profile_length );
  for( pos_i = 0; pos_i < profile_length; pos_i++ ) {
    consensus_sequence[ pos_i ] =
      ( *profile_ptr )[ pos_i ][ galosh::Emission::Match ].maximumValueType();
  }
  tmp_consensus_output_stream << consensus_sequence;
  // Render once; the string stays alive for every use of c_str() below.
  consensus_text = tmp_consensus_output_stream.str();

  /* Allocate a growable MSA, and auxiliary parse data coupled to the MSA allocation */
#ifdef eslAUGMENT_ALPHABET
  if (afp->abc   &&  (msa = esl_msa_CreateDigital(afp->abc, 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }
#endif
  if (! afp->abc &&  (msa = esl_msa_Create(                 16, -1)) == NULL) { status = eslEMEM; goto ERROR; }

  // Set first-and-only seq to the consensus.  This should set sqlen[0] to the profile's length and set ax to have length 1 and ax[0] to be the sequence itself.  Also msa->sqname[0] to the "name" of that consensus sequence.

  /* if nec, make room for the new seq; on failure go through ERROR so the MSA is not leaked */
  if (msa->nseq >= msa->sqalloc && (status = esl_msa_Expand(msa)) != eslOK) goto ERROR;
  seqidx = msa->nseq; // 0
  msa->nseq++; // = 1
  if ((status = esl_strdup(seqname, -1, &(msa->sqname[seqidx]))) != eslOK) goto ERROR;
  // NOTE: Could add description of this "sequence" here, using esl_msa_SetSeqDescription(msa, seqidx, desc).
#ifdef eslAUGMENT_ALPHABET
  if (msa->flags & eslMSA_DIGITAL)
    {
      // NOTE (profillic): There was a bug in this; it had said .."esl_abc_dsqcat(msa->abc, " where it should have said .."esl_abc_dsqcat(msa->abc->inmap, "
      if((status = esl_abc_dsqcat(msa->abc->inmap, &(msa->ax[seqidx]), &(msa->sqlen[seqidx]), consensus_text.c_str(), profile_length)) != eslOK) {
        /* invalid char(s), get informative error message */
        // NOTE(review): the formatted message lands in the local errmsg2 and is not visible to the caller; afp->errmsg keeps the raw ValidateSeq text.
        if (esl_abc_ValidateSeq(msa->abc, consensus_text.c_str(), profile_length, afp->errmsg) != eslOK) 
          ESL_XFAIL(eslEFORMAT, errmsg2, "%s (line %d): %s", msa->sqname[0], afp->linenumber, afp->errmsg);
        /* digitization failed for some other reason: still a failure, report it */
        goto ERROR;
      }
    }
#endif
  if (! (msa->flags & eslMSA_DIGITAL))
    {
      if ((status = esl_strcat(&(msa->aseq[seqidx]), 0, consensus_text.c_str(), profile_length)) != eslOK) goto ERROR;
      msa->sqlen[seqidx] = profile_length;
    } 
  msa->alen = profile_length;

  /// \todo OR read in a fasta file of sequences too.
  /// \todo (Optional?) Set msa->name to the name of the profile (file?)
  if ((status = esl_strdup(msaname, -1, &(msa->name))) != eslOK) goto ERROR;
  /// \todo make sure eslMSA_HASWGTS is FALSE .. OR set it to TRUE and set msa->wgt[idx] to 1.0.
  /// \note Could have secondary structure (per sequence) too. msa->ss[0]. msa->sslen[0] should be the same as msa->sqlen[0].
  /// \todo Investigate what msa->sa and msa->pp are for.

  /// \todo Give the newly built MSA a going-over with verify_parse(msa, afp->errmsg), which would fill in errmsg if it sees a problem; currently not called.

  if (( status = esl_msa_SetDefaultWeights(msa)) != eslOK) goto ERROR;

  if (ret_msa != NULL) *ret_msa = msa; else esl_msa_Destroy(msa);
  return eslOK;

 ERROR:
  if (msa != NULL)      esl_msa_Destroy(msa);
  if (ret_msa != NULL) *ret_msa = NULL;
  return status;
}