Ejemplo n.º 1
0
int 
ILogsum(int s1, int s2)
{
  const int max = ESL_MAX(-INFTY, ESL_MAX(s1, s2));
  const int min = ESL_MIN(s1, s2);
  return  (min <= -INFTY || (max-min) >= LOGSUM_TBL) ? max : max + ilogsum_lookup[max-min];
} 
Ejemplo n.º 2
0
int
esl_hmm_Forward(const ESL_DSQ *dsq, int L, const ESL_HMM *hmm, ESL_HMX *fwd, float *opt_sc)
{
  int   i, k, m;
  int   M     = hmm->M;
  float logsc = 0;
  float max;

  fwd->sc[0] = 0.0;

  if (L == 0) {
    fwd->sc[L+1] = logsc = log(hmm->pi[M]);
    if (opt_sc != NULL) *opt_sc = logsc;
    return eslOK;
  }

  max = 0.0;
  for (k = 0; k < M; k++) {
    fwd->dp[1][k] = hmm->eo[dsq[1]][k] * hmm->pi[k];
    max = ESL_MAX(fwd->dp[1][k], max);
  }
  for (k = 0; k < M; k++) {
    fwd->dp[1][k] /= max;
  }
  fwd->sc[1] = log(max);

  for (i = 2; i <= L; i++)
    {
      max = 0.0;
      for (k = 0; k < M; k++)
	{
	  fwd->dp[i][k] = 0.0;
	  for (m = 0; m < M; m++)
	    fwd->dp[i][k] += fwd->dp[i-1][m] * hmm->t[m][k];

	  fwd->dp[i][k] *= hmm->eo[dsq[i]][k];
	  
	  max = ESL_MAX(fwd->dp[i][k], max);
	}
      
      for (k = 0; k < M; k++)
	fwd->dp[i][k] /= max;
      fwd->sc[i] = log(max);
    }
	  
  
  fwd->sc[L+1] = 0.0;
  for (m = 0; m < M; m++) 
    fwd->sc[L+1] += fwd->dp[L][m] * hmm->t[m][M];
  fwd->sc[L+1] = log(fwd->sc[L+1]);

  logsc = 0.0;
  for (i = 1; i <= L+1; i++)
    logsc += fwd->sc[i];

  fwd->M = hmm->M;
  fwd->L = L;
  if (opt_sc != NULL) *opt_sc = logsc;
  return eslOK;
}
Ejemplo n.º 3
0
int
esl_hmm_Backward(const ESL_DSQ *dsq, int L, const ESL_HMM *hmm, ESL_HMX *bck, float *opt_sc)
{
  int   i,k,m;
  int   M     = hmm->M;
  float logsc = 0.0;
  float max;
  
  bck->sc[L+1] = 0.0;

  if (L == 0) {
    bck->sc[0] = logsc = log(hmm->pi[M]);
    if (opt_sc != NULL) *opt_sc = logsc;
    return eslOK;
  }
  
  max = 0.0;
  for (k = 0; k < M; k++)
    {
      bck->dp[L][k] = hmm->t[k][M];
      max = ESL_MAX(bck->dp[L][k], max);
    }
  for (k = 0; k < M; k++)
    bck->dp[L][k] /= max;
  bck->sc[L] = log(max);

  for (i = L-1; i >= 1; i--)
    {
      max = 0.0;
      for (k = 0; k < M; k++)
	{
	  bck->dp[i][k] = 0.0;
	  for (m = 0; m < M; m++)
	    bck->dp[i][k] += bck->dp[i+1][m] * hmm->eo[dsq[i+1]][m] * hmm->t[k][m];
	  
	  max = ESL_MAX(bck->dp[i][k], max);
	}

      for (k = 0; k < M; k++)
	bck->dp[i][k] /= max;
      bck->sc[i] = log(max);
    }

  bck->sc[0] = 0.0;
  for (m = 0; m < M; m++)
    bck->sc[0] += bck->dp[1][m] * hmm->eo[dsq[1]][m] * hmm->pi[m];
  bck->sc[0] = log(bck->sc[0]);

  logsc = 0.0;
  for (i = 0; i <= L; i++) 
    logsc += bck->sc[i];

  bck->M = hmm->M;
  bck->L = L;
  if (opt_sc != NULL) *opt_sc = logsc;
  return eslOK;
}  
Ejemplo n.º 4
0
/* Function:  esl_recorder_Read()
 * Synopsis:  Read next line of a stream through an <ESL_RECORDER>.
 * Incept:    SRE, Fri Dec 25 16:31:00 2009 [Casa de Gatos]
 *
 * Purpose:   Read the next line of the input stream that the
 *            <ESL_RECORDER> <rc> is recording. Return a ptr to
 *            it in <*opt_line>. Note that the <ESL_RECORDER> 
 *            deals with allocation and freeing of this line;
 *            if caller wants to keep it for something, it must
 *            make a copy immediately, because subsequent calls
 *            to <esl_recorder_*> functions may overwrite these
 *            internal memory buffers.
 *
 * Returns:   <eslOK> on success.
 *            <eslEOF> if no more lines exist in the stream.
 *
 * Throws:    <eslEMEM> on an allocation failure.
 */
int
esl_recorder_Read(ESL_RECORDER *rc, char **opt_line)
{
  int idx = (rc->ncurr - rc->baseline) % rc->nalloc;     /* index of line to read, in wrapped coords */
  int status;
  
  /* if currline <= lastline, we already have the line recorded;
   * else we need to read a new one from <fp> */
  if (rc->ncurr >= rc->nread)
    {
      /* if reading a new line would overwrite our marked start, grow */
      if ( rc->markline >= 0 && 
	   ((rc->ncurr - rc->baseline) % rc->nalloc == ((rc->markline - rc->baseline) % rc->nalloc)))
	{
	  int xtra = ESL_MAX(3, (rc->nalloc / 3));
	  status = esl_recorder_ResizeTo(rc, rc->nalloc + xtra);
	  if (status) goto ERROR;
	  idx = (rc->ncurr - rc->baseline) % rc->nalloc; 
	}

      rc->offset[idx] = ftello(rc->fp);
      status = esl_fgets(&(rc->line[idx]), &(rc->lalloc[idx]), rc->fp);
      if (status) goto ERROR;
      rc->nread++;
    }

  rc->ncurr++;
  if (opt_line) *opt_line = rc->line[idx];
  return eslOK;

 ERROR:
  if (opt_line) *opt_line = NULL;
  return status;
}
Ejemplo n.º 5
0
/* Function:  p7_GHybrid()
 * Synopsis:  The "hybrid" algorithm.
 * Incept:    SRE, Sat May 19 10:01:46 2007 [Janelia]
 *
 * Purpose:   The profile HMM version of the Hwa "hybrid" alignment
 *            algorithm \citep{YuHwa02}. The "hybrid" score is the
 *            maximum score in the Forward matrix. 
 *            
 *            Given a digital sequence <dsq> of length <L>, a profile
 *            <gm>, and DP matrix <mx> allocated for at least <gm->M>
 *            by <L> cells; calculate the probability of the sequence
 *            given the model using the Forward algorithm; return
 *            the calculated Forward matrix in <mx>, and optionally
 *            return the Forward score in <opt_fwdscore> and/or the
 *            Hybrid score in <opt_hybscore>.
 *           
 *            This is implemented as a wrapper around <p7_GForward()>.
 *            The Forward matrix and the Forward score obtained from
 *            this routine are identical to what <p7_GForward()> would
 *            return.
 *           
 *            The scores are returned in lod form.  To convert to a
 *            bitscore, the caller needs to subtract a null model lod
 *            score, then convert to bits.
 *           
 * Args:      dsq          - sequence in digitized form, 1..L
 *            L            - length of dsq
 *            gm           - profile. 
 *            gx           - DP matrix with room for an MxL alignment
 *            opt_fwdscore - optRETURN: Forward lod score in nats.
 *            opt_hybscore - optRETURN: Hybrid lod score in nats.
 *
 * Returns:   <eslOK> on success, and results are in <mx>, <opt_fwdscore>,
 *            and <opt_hybscore>.
 */
int
p7_GHybrid(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_fwdscore, float *opt_hybscore)
{
  float   F    = -eslINFINITY;
  float   H    = -eslINFINITY;
  float **dp   = gx->dp;
  int     i,k;
  int     status;

  if ((status = p7_GForward(dsq, L, gm, gx, &F)) != eslOK)  goto ERROR;
  for (i = 1; i <= L; i++)
    for (k = 1 ; k <= gm->M; k++)
      H = ESL_MAX(H, MMX(i,k));
  
  gx->M = gm->M;
  gx->L = L;
  if (opt_fwdscore != NULL) *opt_fwdscore = F;
  if (opt_hybscore != NULL) *opt_hybscore = H;
  return eslOK;

 ERROR:
  if (opt_fwdscore != NULL) *opt_fwdscore = 0;
  if (opt_hybscore != NULL) *opt_hybscore = 0;
  return status;
}
Ejemplo n.º 6
0
int p7_ViterbiCOPSw_run(DATA_COPS16* dcops, SEQ **seqsdb, float* results)
{
	int j, maxL = 0;
	for (j = 0; j < SSE16_NVALS; j++)
		if (seqsdb[j]->length > maxL)
			maxL = seqsdb[j]->length;

	if (maxL > dcops->allocL-10)
	{	// realloc
		dcops->allocL = ESL_MAX(dcops->allocL*2, roundtop(maxL,1024));
		free_cops_buffers(dcops);
		alloc_cops_buffers(dcops, dcops->allocL);
	}

	if (dcops->L != maxL)
	{   dcops->L  = maxL;
		p7_ReconfigLength(dcops->gm, maxL);
	}

	for (j = 0; j < SSE16_NVALS; j++)
		// must be copied since we run N seqs in parallel up to the max length of the longest
		memcpy(dcops->seqs[j], seqsdb[j]->seq, (seqsdb[j]->length+1)*sizeof(ESL_DSQ));
	
	// pad sequences with k = Alphasize
	byte alphSize = dcops->gm->abc->Kp;
	for (j = 0; j < SSE16_NVALS; j++)
		memset(dcops->seqs[j]+seqsdb[j]->length+1, alphSize, maxL-seqsdb[j]->length);

	for (j = 1; j <= dcops->gm->M/dcops->partition; j++)
		memset((void*) dcops->synchflags[j], 0, dcops->nflags);

	dcops->synccontrol++;
	p7_ViterbiCOPSw_partitioned_threaded(dcops, results, NTHREADS-1	);
	return maxL;
}
Ejemplo n.º 7
0
/* Open the source sequence database for negative subseqs;
 * upon return, cfg->dbfp is open (digital, SSI indexed);
 * cfg->db_maxL and cfg->db_nseq are set.
 */
static int
process_dbfile(struct cfg_s *cfg, char *dbfile, int dbfmt)
{
  ESL_SQ     *sq    = esl_sq_CreateDigital(cfg->abc);
  int         status;
  
  /* Open the sequence file in digital mode */
  status = esl_sqfile_OpenDigital(cfg->abc, dbfile, dbfmt, NULL, &(cfg->dbfp));
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", dbfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", dbfile);
  else if (status == eslEINVAL)    esl_fatal("Can't autodetect stdin or .gz.");
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  /* Read info on each sequence */
  cfg->db_nseq   = 0;
  cfg->db_maxL   = 0;
  while ((status = esl_sqio_ReadInfo(cfg->dbfp, sq)) == eslOK) {
    cfg->db_maxL = ESL_MAX(sq->L, cfg->db_maxL);
    cfg->db_nseq++;
    esl_sq_Reuse(sq);
  }
  if (status != eslEOF) esl_fatal("Something went wrong with reading the seq db");

  /* Open SSI index */
  if (esl_sqfile_OpenSSI(cfg->dbfp, NULL) != eslOK) esl_fatal("Failed to open SSI index file");
  if (cfg->dbfp->data.ascii.ssi->nprimary != cfg->db_nseq)     esl_fatal("oops, nprimary != nseq");

  esl_sq_Destroy(sq);
  return eslOK;
}
Ejemplo n.º 8
0
/* Function:  esl_recorder_ResizeTo()
 * Synopsis:  Reallocate an <ESL_RECORDER> for a new <maxlines>
 * Incept:    SRE, Fri Dec 25 17:02:46 2009 [Casa de Gatos]
 *
 * Purpose:   Reallocate the <ESL_RECORDER> <rc> to have a new
 *            window size <maxlines>. 
 *            
 *            The new <maxlines> may be more or less than the previous
 *            window size for <rc>.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEMEM> if (re-)allocation fails.
 *
 *            <eslEINVAL> if the recorder has a marked line (for start
 *            of a block) and you try to shrink it so much that that
 *            marked line would be lost.
 *
 *            <eslEINCONCEIVABLE> on any baseline resetting problem;
 *            this would have to be an internal error in the module.
 * 
 * Note:      We may have to repermute the line array, and reset its
 *            baseline, as follows.
 *            
 *            In the growth case: if the line array is out of order
 *            (circularly permuted) we must straighten it out, which
 *            means resetting the baseline.  
 *            i.e. to grow 3 1 2 to nalloc=6, we need 1 2 3 x x x; 
 *            simple reallocation to 3 1 2 x x x doesn't work,
 *            next read would make 3 4 2 x x x.
 *          
 *            In the shrinkage case: if the line array is in use beyond the
 *            new array size, we set a new baseline to keep as much of the
 *            old array as possible.
 * 
 *            i.e. for 6->3  
 *            1 2 3 x x x -> 1 2 3
 *            1 2 3 4 x x -> 2 3 4 with new baseline=2.
 *            4 5 0 1 2 3 -> 3 4 5 with new baseline=3
 */
int
esl_recorder_ResizeTo(ESL_RECORDER *rc, int new_maxlines)
{
  int   idx;
  int   newbase;
  void *tmp;
  int   minlines;
  int   status;

  if (new_maxlines == rc->nalloc) return eslOK;

  if (new_maxlines > rc->nalloc) /* growth case */
    {
      if ((rc->nread - rc->baseline) / rc->nalloc != 0)	/* array is permuted; reorder it */
	{
	  newbase = ESL_MAX(rc->baseline, rc->nread - rc->nalloc);
	  status = recorder_new_baseline(rc, newbase);
	  if (status) ESL_EXCEPTION(eslEINCONCEIVABLE, "baseline reset failed unexpectedly");
	}
    }
  else 				/* shrinkage case */
    {
      /* check that the marked line (if any) will stay in window */
      if (rc->markline >= 0)	
	{
	  minlines = rc->nread - rc->markline;
	  if (new_maxlines < minlines)
	    ESL_EXCEPTION(eslEINVAL, "can't shrink that far without losing marked line");
	}
      /* check that current line will stay in window */
      minlines = rc->nread - rc->ncurr + 1;
      if (new_maxlines < minlines)
	ESL_EXCEPTION(eslEINVAL, "can't shrink that far without losing current line");

      if (rc->nread - rc->baseline > new_maxlines) /* baseline needs to move up */
	{
	  newbase = rc->nread - new_maxlines;
	  status = recorder_new_baseline(rc, newbase);
	  if (status) ESL_EXCEPTION(eslEINCONCEIVABLE, "baseline reset failed unexpectedly");
	}

      for (idx = new_maxlines; idx < rc->nalloc; idx++)
	if (rc->line[idx]) free(rc->line[idx]);
    }

  ESL_RALLOC(rc->line,   tmp, sizeof(char *) * new_maxlines);
  ESL_RALLOC(rc->lalloc, tmp, sizeof(int)    * new_maxlines);
  ESL_RALLOC(rc->offset, tmp, sizeof(off_t)  * new_maxlines);
  for (idx = rc->nalloc; idx < new_maxlines; idx++) /* no-op in shrinkage case */
    { 
      rc->line[idx]   = NULL;
      rc->lalloc[idx] = 0;
      rc->offset[idx] = 0;
    }
  rc->nalloc = new_maxlines;
  return eslOK;

 ERROR:
  return status;
}
Ejemplo n.º 9
0
float
LogSum2(float s1, float s2)
{
  const float max = ESL_MAX(s1, s2);
  const float min = ESL_MIN(s1, s2);
  return  (min == -eslINFINITY || (max-min) >= 23.f) ? max : max + flogsum_lookup[(int)((max-min)*INTSCALE)];
} 
Ejemplo n.º 10
0
/* Function:  p7_tophits_GetMaxPositionLength()
 * Synopsis:  Returns maximum position length in hit list (targets).
 *
 * Purpose:   Returns the length of the longest hit location (start/end)
 *               of all the registered hits, in chars. This is useful when
 *               deciding how to format output.
 *
 *            The maximum is taken over all registered hits. This
 *            opens a possible side effect: caller might print only
 *            the top hits, and the max name length in these top hits
 *            may be different than the max length over all the hits.
 *
 *            Used specifically for nhmmer output, so expects only one
 *            domain per hit
 *
 *            If there are no hits in <h>, or none of the
 *            hits have names, returns 0.
 */
int
p7_tophits_GetMaxPositionLength(P7_TOPHITS *h)
{
  int i, max, n;
  char buffer [11];

  for (max = 0, i = 0; i < h->N; i++) {
    if (h->unsrt[i].dcl[0].ia > 0) {
      n = sprintf (buffer, "%d", h->unsrt[i].dcl[0].ia);
      max = ESL_MAX(n, max);
      n = sprintf (buffer, "%d", h->unsrt[i].dcl[0].ib);
      max = ESL_MAX(n, max);
    }
  }
  return max;
}
Ejemplo n.º 11
0
/* tests: 
 *   1. each sampled trace must validate.
 *   2. each trace must be <= viterbi trace score
 *   3. in a large # of traces, one is "equal" to the viterbi trace score.
 *      (this of course is stochastic; but it's true for the particular
 *       choice of RNG seed used in tests here.)
 */
static void
utest_stotrace(ESL_GETOPTS *go, ESL_RANDOMNESS *rng, ESL_ALPHABET *abc, P7_PROFILE *gm, P7_OPROFILE *om, ESL_DSQ *dsq, int L, int ntrace)
{
  P7_GMX   *gx  = NULL;
  P7_OMX   *ox  = NULL;
  P7_TRACE *tr  = NULL;
  char      errbuf[eslERRBUFSIZE];
  int       idx;
  float     maxsc = -eslINFINITY;
  float     vsc, sc;

  if ((gx     = p7_gmx_Create(gm->M, L))        == NULL)  esl_fatal("generic DP matrix creation failed");
  if ((ox     = p7_omx_Create(gm->M, L, L))     == NULL)  esl_fatal("optimized DP matrix create failed");
  if ((tr     = p7_trace_Create())              == NULL)  esl_fatal("trace creation failed");

  if (p7_GViterbi(dsq, L, gm, gx, &vsc)         != eslOK) esl_fatal("viterbi failed");
  if (p7_Forward (dsq, L, om, ox, NULL)         != eslOK) esl_fatal("forward failed");

  for (idx = 0; idx < ntrace; idx++)
    {
      if (p7_StochasticTrace(rng, dsq, L, om, ox, tr) != eslOK) esl_fatal("stochastic trace failed");
      if (p7_trace_Validate(tr, abc, dsq, errbuf)     != eslOK) esl_fatal("trace invalid:\n%s", errbuf);
      if (p7_trace_Score(tr, dsq, gm, &sc)            != eslOK) esl_fatal("trace scoring failed"); 

      maxsc = ESL_MAX(sc, maxsc);
      if (sc > vsc) esl_fatal("sampled trace has score > optimal Viterbi path; not possible");
      p7_trace_Reuse(tr);
    }
  if (esl_FCompare(maxsc, vsc, 0.1) != eslOK) esl_fatal("stochastic trace failed to sample the Viterbi path");
  
  p7_trace_Destroy(tr);
  p7_omx_Destroy(ox);
  p7_gmx_Destroy(gx);
}
Ejemplo n.º 12
0
/* Function:  p7_oprofile_GetSSVEmissionScoreArray()
 * Synopsis:  Retrieve MSV residue emission scores from a
 *            profile into an array
 *
 * Purpose:   Extract an implicitly 2D array of 8-bit int MSV residue
 *            emission scores from a profile <om>. <arr> must
 *            be allocated by the calling function to be of size
 *            ( om->abc->Kp * ( om->M  + 1 )), and indexing into the array
 *            is done as  [om->abc->Kp * i +  c ] for character c at
 *            position i.
 *
 *            In the dummy implementation, we need to convert from the
 *            float emission probabilities to 8-bit int scores. Conversion
 *            is based on code from the function mf_conversion in impl_sse's
 *            p7_oprofile.c
 *
 * Args:      <om>   - profile, containing emission information
 *            <arr>  - preallocated array into which scores will be placed
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    (no abnormal error conditions)
 */
int
p7_oprofile_GetSSVEmissionScoreArray(const P7_OPROFILE *om, uint8_t *arr )
{
  int     M   = om->M;    /* length of the query                                          */
  int i, j;
  float x;
  float max = 0.0;
  float scale;
  uint8_t bias;

  /* scale and bias required for float->8bit conversion   */
  scale = 3.0 / eslCONST_LOG2;                    /* scores in units of third-bits */
  for (i = 0; i < om->abc->K; i++)  max = ESL_MAX(max, esl_vec_FMax(om->rsc[i], (M+1)*2));
  max = -1.0f * roundf(scale * -1.0 * max);   //based on unbiased_byteify
  bias   = (max > 255.) ? 255 : (uint8_t) max;


  for (i = 1; i <= om->M; i++) {
    for (j=0; j<om->abc->Kp; j++) {
      //based on p7_oprofile's biased_byteify()
      x =  -1.0f * roundf(scale * om->rsc[j][(i) * p7P_NR     + p7P_MSC]);
      arr[i*om->abc->Kp + j] = (x > 255. - bias) ? 255 : (uint8_t) (x + bias);
    }
  }


  return eslOK;
}
Ejemplo n.º 13
0
/* Function:  esl_recorder_MarkBlock()
 * Synopsis:  Mark first line to be saved in a block.
 * Incept:    SRE, Fri Jan  1 11:13:53 2010 [Magallon]
 *
 * Purpose:   Mark line number <markline> (0..N-1) in a file being read
 *            through the <ESL_RECORDER> <rc> as the first line in a
 *            block of lines to be parsed later, when the end of
 *            the block is found. 
 *            
 *            This mark makes sure that the <ESL_RECORDER> will keep
 *            the entire block of lines in memory, starting at or
 *            before the mark. When a mark is active,
 *            <esl_recorder_Read()> will reallocate and grow the
 *            recorder as necessary, rather than overwriting the mark.
 *
 * Returns:   <eslOK> on success.
 *
 * Throws:    <eslEINVAL> if the <markline> has already passed out
 *            of the recorder's memory.
 */
int
esl_recorder_MarkBlock(ESL_RECORDER *rc, int markline)
{
  int line0 = ESL_MAX(rc->baseline, rc->nread - rc->nalloc);
  
  if (markline < line0) ESL_EXCEPTION(eslEINVAL, "recorder window already passed marked line");
  rc->markline = markline;
  return eslOK;
}
Ejemplo n.º 14
0
/* Function:  p7_tophits_GetMaxShownLength()
 * Synopsis:  Returns max shown name/accession length in hit list.
 *
 * Purpose:   Same as <p7_tophits_GetMaxNameLength()>, but 
 *            for the case when --acc is on, where
 *            we show accession if one is available, and 
 *            fall back to showing the name if it is not.
 *            Returns the max length of whatever is being
 *            shown as the reported "name".
 */
int
p7_tophits_GetMaxShownLength(P7_TOPHITS *h)
{
  int i, max, n;
  for (max = 0, i = 0; i < h->N; i++)
  {
    if (h->unsrt[i].acc != NULL && h->unsrt[i].acc[0] != '\0')
    {
      n   = strlen(h->unsrt[i].acc);
      max = ESL_MAX(n, max);
    }
    else if (h->unsrt[i].name != NULL)
    {
      n   = strlen(h->unsrt[i].name);
      max = ESL_MAX(n, max);
    }
  }
  return max;
}
Ejemplo n.º 15
0
/* yes LogSum2 and FLogsum are identical, this is for backwards compatibility */
float
FLogsum(float s1, float s2)
{
  const float max = ESL_MAX(s1, s2);
  const float min = ESL_MIN(s1, s2);
#if 0
  return (min == -eslINFINITY || (max-min) >= 23.f) ? max : max + sreLOG2(1.0 + sreEXP2(min-max));  /* EPN: While debugging. Replaces logsum table with analytical calculation. Remember to remove! */
#endif
  return  (min == -eslINFINITY || (max-min) >= 23.f) ? max : max + flogsum_lookup[(int)((max-min)*INTSCALE)];
} 
Ejemplo n.º 16
0
/* Function:  p7_tophits_GetMaxAccessionLength()
 * Synopsis:  Returns maximum accession length in hit list (targets).
 *
 * Purpose:   Same as <p7_tophits_GetMaxNameLength()>, but for
 *            accessions. If there are no hits in <h>, or none
 *            of the hits have accessions, returns 0.
 */
int
p7_tophits_GetMaxAccessionLength(P7_TOPHITS *h)
{
  int i, max, n;
  for (max = 0, i = 0; i < h->N; i++)
    if (h->unsrt[i].acc != NULL) {
      n   = strlen(h->unsrt[i].acc);
      max = ESL_MAX(n, max);
    }
  return max;
}
Ejemplo n.º 17
0
/* Function:  p7_GMSV()
 * Synopsis:  The MSV score algorithm (slow, correct version)
 * Incept:    SRE, Thu Dec 27 08:33:39 2007 [Janelia]
 *
 * Purpose:   Calculates the maximal score of ungapped local segment
 *            pair alignments, taking advantage of the fact that this
 *            is simply equivalent to setting all MM transitions to 1.0
 *            in a multihit local profile.
 *            
 * Args:      dsq          - sequence in digitized form, 1..L
 *            L            - length of dsq
 *            gm           - profile (can be in any mode)
 *            gx           - DP matrix with room for an MxL alignment
 *            nu           - configuration: expected number of hits (use 2.0 as a default)
 *            opt_sc       - optRETURN: MSV lod score in nats.
 *
 * Returns:   <eslOK> on success.
 * 
 * Note:      This is written deliberately as a modified p7_GViterbi
 *            routine. It could be faster -- we don't need the
 *            interleaved dp matrix or residue scores, since we aren't
 *            calculating D or I states, for example, and we could do
 *            without some of the special states -- but speed is the
 *            job of the optimized implementations. Rather, the goal
 *            here is to establish a stable, probabilistically correct
 *            reference calculation. (Thus, the CC, NN, JJ transitions
 *            are real scores here, not fixed to 0 as in the optimized
 *            versions.)  
 */            
int
p7_GMSV(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float nu, float *opt_sc)
{
  float      **dp    = gx->dp;
  float       *xmx   = gx->xmx;
  float        tloop = logf((float) L / (float) (L+3));
  float        tmove = logf(     3.0f / (float) (L+3));
  float        tbmk  = logf(     2.0f / ((float) gm->M * (float) (gm->M+1)));
  float        tej   = logf((nu - 1.0f) / nu);
  float        tec   = logf(1.0f / nu);
  int          i,k;

  XMX(0,p7G_N) = 0;
  XMX(0,p7G_B) = tmove;                                      /* S->N->B, no N-tail   */
  XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) =-eslINFINITY;  /* need seq to get here */
  for (k = 0; k <= gm->M; k++)
    MMX(0,k) = -eslINFINITY;                                 /* need seq to get here */

  for (i = 1; i <= L; i++) 
    {
      float const *rsc = gm->rsc[dsq[i]];

      MMX(i,0)     = -eslINFINITY;
      XMX(i,p7G_E) = -eslINFINITY;
    
      for (k = 1; k <= gm->M; k++) 
	{
	  MMX(i,k)     = MSC(k) + ESL_MAX(MMX(i-1,k-1), XMX(i-1,p7G_B) + tbmk);
	  XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), MMX(i,k));
	}
   
      XMX(i,p7G_J) = ESL_MAX( XMX(i-1,p7G_J) + tloop,     XMX(i, p7G_E) + tej);
      XMX(i,p7G_C) = ESL_MAX( XMX(i-1,p7G_C) + tloop,     XMX(i, p7G_E) + tec);
      XMX(i,p7G_N) =          XMX(i-1,p7G_N) + tloop;
      XMX(i,p7G_B) = ESL_MAX( XMX(i,  p7G_N) + tmove,     XMX(i, p7G_J) + tmove);
    }
  gx->M = gm->M;
  gx->L = L;
  if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + tmove;
  return eslOK;
}
Ejemplo n.º 18
0
float
p7_masstrace_GetMaxAbsDiff(const P7_MASSTRACE *mte, const P7_MASSTRACE *mta)
{
  int i,k;
  float diff;
  float max = 0.; 

  if (mte->imass && mta->imass)
    {
      for (i = 1; i <= mte->L; i++)
	{
	  diff = fabs(mte->imass[i] - mta->imass[i]); 
	  max  = ESL_MAX(diff, max);
	}
    }
  for (k = 1; k <= mte->M; k++)
    {
      diff = fabs(mte->kmass[k] - mta->kmass[k]); 
      max  = ESL_MAX(diff, max);
    }
  return max;
}
Ejemplo n.º 19
0
/* is_multidomain_region()
 * SRE, Fri Feb  8 11:35:04 2008 [Janelia]
 *
 * This defines the trigger for when we need to hand a "region" off to
 * a deeper analysis (using stochastic tracebacks and clustering)
 * because there's reason to suspect it may encompass two or more
 * domains. 
 * 
 * The criterion is to find the split point z at which the expected
 * number of E occurrences preceding B occurrences is maximized, and
 * if that number is greater than the heuristic threshold <ddef->rt3>,
 * then return TRUE. In other words, we're checking to see if there's
 * any point in the region at which it looks like an E was followed by
 * a B, as expected for a multidomain interpretation of the region.
 * 
 * More precisely: return TRUE if  \max_z [ \min (B(z), E(z)) ]  >= rt3
 * where
 *   E(z) = expected number of E states occurring in region before z is emitted
 *        = \sum_{y=i}^{z} eocc[i]  =  etot[z] - etot[i-1]
 *   B(z) = expected number of B states occurring in region after z is emitted
 *        = \sum_{y=z}^{j} bocc[i]  =  btot[j] - btot[z-1]               
 *        
 *        
 * Because this relies on the <ddef->etot> and <ddef->btot> arrays,
 * <calculate_domain_posteriors()> needs to have been called first.
 *
 * Xref:    J2/101.  
 */
static int
is_multidomain_region(P7_DOMAINDEF *ddef, int i, int j)
{
  int   z;
  float max;
  float expected_n;

  max = -1.0;
  for (z = i; z <= j; z++)
    {
      expected_n = ESL_MIN( (ddef->etot[z] - ddef->etot[i-1]), (ddef->btot[j] - ddef->btot[z-1]));
      max        = ESL_MAX(max, expected_n);
    }
  return ( (max >= ddef->rt3) ? TRUE : FALSE);
}
Ejemplo n.º 20
0
int p7_ViterbiStream(DATA_STREAM* dstream, dsq_cmp_t **seqsdb, float* results)
{
	int j, maxL = 0;

#if 0
	for (j = 0; j < 8; j++)
		if (seqsdb[j]->length > maxL)
			maxL = seqsdb[j]->length;

//	maxL = 2000;
//	for (j = 0; j < 8; j++)	printf("%d ", seqsdb[j]->length); 	printf("\n"); 

	if (maxL > dstream->allocL-10)
	{	// realloc
		dstream->allocL = ESL_MAX(dstream->allocL*2, roundtop(maxL,1024));
		free_stream_buffers(dstream);
		alloc_stream_buffers(dstream, dstream->allocL);
	}

	if (dstream->L != maxL)
	{   dstream->L  = maxL;
		p7_ReconfigLength(dstream->gm, maxL);
	}

	for (j = 0; j < 8; j++)
		memcpy(dstream->seqs[j], seqsdb[j]->seq, (seqsdb[j]->length+1)*sizeof(ESL_DSQ));
	
	// pad sequences with k = Alphasize
	for (j = 0; j < 8; j++)
		memset(dstream->seqs[j]+seqsdb[j]->length+1, dstream->gm->abc->Kp, maxL-seqsdb[j]->length);

	for (j = 1; j <= dstream->gm->M/dstream->partition; j++)
		memset((void*) dstream->synchflags[j], 0, dstream->nflags);

	for (j = 1; j <= dstream->gm->M/dstream->partition; j++)
		memset((void*) dstream->synchflags[j], 0, dstream->nflags);
#endif

if(1)
	for (j = 0; j < NTHREADS-1; j++)
	{	sem_post(&semsynch[j]);
		syncflags[j] = 1;
	}
	
	dstream->synccontrol++;
	viterbi_stream_word_partitioned(dstream, results, NTHREADS-1	);
	return maxL;
}
Ejemplo n.º 21
0
/* Function:  esl_recorder_Position()
 * Synopsis:  Reset the recorder to a new starting line position.
 * Incept:    SRE, Mon Dec 28 10:25:22 2009 [Casa de Gatos]
 *
 * Purpose:   Reset the recorder <rc> to a new line position <linenumber>,
 *            starting from 0. The next call to <esl_recorder_Read()>
 *            will read this line.
 *            
 *            The <linenumber> can be ahead of the furthest line read
 *            by the recorder so far, in which case it calls
 *            <esl_recorder_Read()> until it reaches the proper
 *            position. This can result in a return code of <eslEOF>,
 *            if no such line exists in the stream.
 *            
 *            If the <linenumber> falls before (outside) the
 *            recorder's history window, an <eslEINVAL> exception is
 *            thrown.
 *
 * Returns:   <eslOK> on success.
 *            <eslEOF> if <linenumber> is larger than current position
 *            in file, and the stream ends before line <linenumber> is
 *            reached.
 *
 * Throws:    <eslEMEM> on allocation failure; this can only happen
 *            if <linenumber> is larger than current position in
 *            file, forcing <esl_recorder_Read()> calls to reach that
 *            line.
 */
int 
esl_recorder_Position(ESL_RECORDER *rc, int linenumber)
{
  /* The recorder stores lines MAX(baseline,<nread-nalloc>)..<nread>-1 */
  int line0  = ESL_MAX(rc->baseline, rc->nread - rc->nalloc);
  int status;
  
  if (linenumber < line0)  
    ESL_EXCEPTION(eslEINVAL, "recorder's window is past that line");

  if (linenumber >= rc->nread) {
    while (rc->nread < linenumber)
      if ((status = esl_recorder_Read(rc, NULL)) != eslOK) return status;
  }
  
  rc->ncurr = linenumber;
  return eslOK;
}
Ejemplo n.º 22
0
/* guaranteed s1 >= -INFTY, s2 >= -INFTY */
int 
ILogsumNI(int s1, int s2)
{
  ESL_DASSERT1((s1 > -INFTY));
  ESL_DASSERT1((s2 > -INFTY));
  /*assert(s1 > -INFTY);
    assert(s2 > -INFTY);*/

  const int max = ESL_MAX(s1, s2);
  const int min = ESL_MIN(s1, s2);
  return  ((max-min) >= LOGSUM_TBL) ? max : max + ilogsum_lookup[max-min];
  /* about 10% slower 
     if(s1 > s2) 
    return  ((s1-s2) >= LOGSUM_TBL) ? s1 : s1 + ilogsum_lookup[s1-s2];
    else
    return  ((s2-s1) >= LOGSUM_TBL) ? s2 : s2 + ilogsum_lookup[s2-s1];
  */
} 
Ejemplo n.º 23
0
/* Function: DispatchSqBlockAlignment()
 * Date:     EPN, Fri Dec 30 14:59:43 2011
 *
 * Purpose:  Given a CM and a block of sequences, align the
 *           sequence(s) using the appropriate alignment function and
 *           return relevant data for eventual output in <ret_dataA>.
 *           This function simply calls DispatchSqAlignment() serially
 *           for each sequence in the block, and creates an array
 *           of the <ret_data> DispatchSqAlignment() returns.
 *
 *           Currently <mode>, <cp9b_valid> and <pass_idx> values sent
 *           to DispatchSqAlignment() are hard-coded to
 *           TRMODE_UNKNOWN, FALSE, and PLI_PASS_5P_AND_3P_FORCE (if
 *           cm->align_opts & CM_ALIGN_TRUNC) or PLI_PASS_STD_ANY (if
 *           (! cm->align_opts & CM_ALIGN_TRUNC)). This is because
 *           this function is only used by the alignment pipeline, in
 *           which these values are correct. If this changes, we may
 *           want caller to pass in an array of modes, cp9b_valids and
 *           pass_idx values, one per sq.
 *
 *           If (cm->flags & CM_ALIGN_XTAU) we'll potentially tighten
 *           HMM bands until the required DP matrices are below out
 *           limit (<mxsize>). cm->maxtau is the max allowed tau value
 *           during this iterative band tightening, and cm->xtau is
 *           the factor by which we multiply cm->tau at each iteration
 *           during band tightening.
 *
 * Args:     cm        - the covariance model
 *           errbuf    - char buffer for reporting errors
 *           sq_block  - block of sequences to align
 *           mxsize    - max size in Mb of allowable DP mx
 *           w         - stopwatch for timing individual stages
 *           w_tot     - stopwatch for timing total time per seq
 *           r         - RNG, req'd if CM_ALIGN_SAMPLE, can be NULL otherwise
 *           ret_dataA - RETURN: newly created array of CM_ALNDATA objects
 *
 * Returns:  eslOK on success;
 *           eslEINCOMPAT on contract violation, errbuf is filled;
 *           eslEMEM if we run out of memory;
 *           <ret_dataA> is alloc'ed and filled with sq_block->count CM_ALNDATA objects.
 */
int
DispatchSqBlockAlignment(CM_t *cm, char *errbuf, ESL_SQ_BLOCK *sq_block, float mxsize, ESL_STOPWATCH *w, 
			 ESL_STOPWATCH *w_tot, ESL_RANDOMNESS *r, CM_ALNDATA ***ret_dataA)
{
  int           status;          /* easel status */
  int           j;               /* counter over parsetrees */
  CM_ALNDATA  **dataA = NULL;    /* CM_ALNDATA array we'll create and return */
  ESL_SQ       *sqp;             /* ptr to a ESL_SQ */
  int           pass_idx;        /* pass_idx passed to DispatchSqAlignment() */
  char          mode;            /* mode passed to DispatchSqAlignment() */
  int           cp9b_valid;      /* passed to DispatchSqAlignment() */

  ESL_ALLOC(dataA, sizeof(CM_ALNDATA *) * ESL_MAX(1, sq_block->count)); // avoid 0 malloc
  for(j = 0; j < sq_block->count; j++) dataA[j] = NULL;

  /* DispatchSqAligment() needs a mode, pipeline pass index, and
   * knowledge of whether cm->cp9b are valid for sequence to align
   * (see note in 'Purpose' above). Currently the relevant values 
   * for these are as follows:
   */
  mode       = TRMODE_UNKNOWN;
  pass_idx   = (cm->align_opts & CM_ALIGN_TRUNC) ? PLI_PASS_5P_AND_3P_FORCE : PLI_PASS_STD_ANY; 
  cp9b_valid = FALSE;

  /* main loop: for each sequence, call DispatchSqAlignment() to do the work */
  for(j = 0; j < sq_block->count; j++) { 
    sqp = sq_block->list + j;
    if((status = DispatchSqAlignment(cm, errbuf, sqp, sq_block->first_seqidx + j, mxsize, mode, pass_idx, cp9b_valid, w, w_tot, r, &(dataA[j]))) != eslOK) goto ERROR;
  }
  *ret_dataA = dataA;

  return eslOK;

 ERROR: 
  if(dataA != NULL) { 
    for(j = 0; j < sq_block->count; j++) { 
      if(dataA[j] != NULL) cm_alndata_Destroy(dataA[j], FALSE);
    }
    free(dataA);
  }
  *ret_dataA = NULL;
  if(status == eslEMEM) ESL_FAIL(status, errbuf, "DispatchSqBlockAlignment(), out of memory");
  else return status; /* errbuf was filled by DispatchSqAlignment() */
}
Ejemplo n.º 24
0
/* the pack send/recv buffer must be big enough to hold either an error message or a result vector.
 * it may even grow larger than that, to hold largest HMM we send.
 */
static int
minimum_mpi_working_buffer(ESL_GETOPTS *go, int N, int *ret_wn)
{
  int n;
  int nerr    = 0;
  int nresult = 0;

  /* error packet */
  if (MPI_Pack_size(eslERRBUFSIZE, MPI_CHAR,   MPI_COMM_WORLD, &nerr)!= 0)return eslESYS;   
  
  /* results packet */
  if (MPI_Pack_size(N,             MPI_DOUBLE, MPI_COMM_WORLD, &n)  != 0) return eslESYS;   nresult += n;   /* scores */
  if (esl_opt_GetBoolean(go, "-a")) {
    if (MPI_Pack_size(N,           MPI_INT,    MPI_COMM_WORLD, &n)  != 0) return eslESYS;   nresult += n;   /* alignment lengths */
  }
  if (MPI_Pack_size(1,             MPI_DOUBLE, MPI_COMM_WORLD, &n)  != 0) return eslESYS;   nresult += n*2; /* mu, lambda */

  /* add the shared status code to the max of the two possible kinds of packets */
  *ret_wn =  ESL_MAX(nresult, nerr);
  if (MPI_Pack_size(1,             MPI_INT,    MPI_COMM_WORLD, &n)  != 0) return eslESYS;   *ret_wn += n;   /* status code */
  return eslOK;
}
Ejemplo n.º 25
0
/* set_effective_seqnumber()
 *
 * <hmm> comes in with weighted observed counts. It goes out with
 * those observed counts rescaled to sum to the "effective sequence
 * number". 
 *
 * <msa> is needed because we may need to see the sequences in order 
 * to determine effective seq #. (for --eclust)
 *
 * <prior> is needed because we may need to parameterize test models
 * looking for the right relative entropy. (for --eent, the default)
 */
static int
effective_seqnumber(P7_BUILDER *bld, const ESL_MSA *msa, P7_HMM *hmm, const P7_BG *bg)
{
  int    status;

  if      (bld->effn_strategy == p7_EFFN_NONE)    hmm->eff_nseq = msa->nseq;
  else if (bld->effn_strategy == p7_EFFN_SET)     hmm->eff_nseq = bld->eset;
  else if (bld->effn_strategy == p7_EFFN_CLUST)
    {
      int nclust;

      status = esl_msacluster_SingleLinkage(msa, bld->eid, NULL, NULL, &nclust);
      if      (status == eslEMEM) ESL_XFAIL(status, bld->errbuf, "memory allocation failed");
      else if (status != eslOK)   ESL_XFAIL(status, bld->errbuf, "single linkage clustering algorithm (at %d%% id) failed", (int)(100 * bld->eid));

      hmm->eff_nseq = (double) nclust;
    }

  else if (bld->effn_strategy == p7_EFFN_ENTROPY)
    {
      double etarget; 
      double eff_nseq;

      etarget = (bld->esigma - eslCONST_LOG2R * log( 2.0 / ((double) hmm->M * (double) (hmm->M+1)))) / (double) hmm->M; /* xref J5/36. */
      etarget = ESL_MAX(bld->re_target, etarget);

      status = p7_EntropyWeight(hmm, bg, bld->prior, etarget, &eff_nseq);
      if      (status == eslEMEM) ESL_XFAIL(status, bld->errbuf, "memory allocation failed");
      else if (status != eslOK)   ESL_XFAIL(status, bld->errbuf, "internal failure in entropy weighting algorithm");
      hmm->eff_nseq = eff_nseq;
    }
    
  p7_hmm_Scale(hmm, hmm->eff_nseq / (double) hmm->nseq);
  return eslOK;

 ERROR:
  return status;
}
Ejemplo n.º 26
0
static void
utest_FLogsumError(ESL_GETOPTS *go, ESL_RANDOMNESS *r)
{
  int     N          = esl_opt_GetInteger(go, "-N");
  float   maxval     = esl_opt_GetReal(go, "-S");
  int     be_verbose = esl_opt_GetBoolean(go, "-v");
  float   maxerr = 0.0;
  float   avgerr = 0.0;
  int     i;
  float   a,b,result,exact,err;

  for (i = 0; i < N; i++)
    {
      a = (esl_random(r) - 0.5) * maxval * 2.; /* uniform draws on -maxval..maxval */
      b = (esl_random(r) - 0.5) * maxval * 2.; 

      exact  = log(exp(a) + exp(b));
      result = p7_FLogsum(a,b);
      err    = fabs(exact-result) / maxval;

      avgerr += err;
      maxerr = ESL_MAX(maxerr, err);

      if (be_verbose)
	printf("%8.4f %8.4f %8.4f %8.4f %8.4f\n", a, b, exact, result, err);
    }
  avgerr /= (float) N;

  if (be_verbose) {
    printf("average error = %f\n", avgerr);
    printf("max error     = %f\n", maxerr);
  }

  if (maxerr > 0.0001) esl_fatal("maximum error of %f is too high: logsum unit test fails", maxerr);
  if (avgerr > 0.0001) esl_fatal("average error of %f is too high: logsum unit test fails", avgerr);
}
Ejemplo n.º 27
0
/* Function:  p7_GViterbi()
 * Synopsis:  The Viterbi algorithm.
 * Incept:    SRE, Tue Jan 30 10:50:53 2007 [Einstein's, St. Louis]
 * 
 * Purpose:   The standard Viterbi dynamic programming algorithm. 
 *
 *            Given a digital sequence <dsq> of length <L>, a profile
 *            <gm>, and DP matrix <gx> allocated for at least <L>
 *            by <gm->M> cells; calculate the maximum scoring path by
 *            Viterbi; return the Viterbi score in <ret_sc>, and the
 *            Viterbi matrix is in <gx>.
 *            
 *            The caller may then retrieve the Viterbi path by calling
 *            <p7_GTrace()>.
 *           
 *            The Viterbi lod score is returned in nats. The caller
 *            needs to subtract a null model lod score, then convert
 *            to bits.
 *           
 * Args:      dsq    - sequence in digitized form, 1..L
 *            L      - length of dsq
 *            gm     - profile. 
 *            gx     - DP matrix with room for an MxL alignment
 *            opt_sc - optRETURN: Viterbi lod score in nats
 *           
 * Return:   <eslOK> on success.
 */
int
p7_GViterbi(const ESL_DSQ *dsq, int L, const P7_PROFILE *gm, P7_GMX *gx, float *opt_sc)
{
  float const *tsc  = gm->tsc;
  float      **dp   = gx->dp;
  float       *xmx  = gx->xmx;
  int          M    = gm->M;
  int          i,k;
  float        esc  = p7_profile_IsLocal(gm) ? 0 : -eslINFINITY;

  /* Initialization of the zero row.  */
  XMX(0,p7G_N) = 0;                                           /* S->N, p=1            */
  XMX(0,p7G_B) = gm->xsc[p7P_N][p7P_MOVE];                    /* S->N->B, no N-tail   */
  XMX(0,p7G_E) = XMX(0,p7G_C) = XMX(0,p7G_J) = -eslINFINITY;  /* need seq to get here */
  for (k = 0; k <= gm->M; k++)
    MMX(0,k) = IMX(0,k) = DMX(0,k) = -eslINFINITY;            /* need seq to get here */

  /* DP recursion */
  for (i = 1; i <= L; i++) 
    {
      float const *rsc = gm->rsc[dsq[i]];
      float sc;

      MMX(i,0) = IMX(i,0) = DMX(i,0) = -eslINFINITY;
      XMX(i,p7G_E) = -eslINFINITY;
    
      for (k = 1; k < gm->M; k++) 
	{
  	  /* match state */
	  sc       = ESL_MAX(    MMX(i-1,k-1)   + TSC(p7P_MM,k-1), 
				 IMX(i-1,k-1)   + TSC(p7P_IM,k-1));
	  sc       = ESL_MAX(sc, DMX(i-1,k-1)   + TSC(p7P_DM,k-1));
	  sc       = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,k-1));
	  MMX(i,k) = sc + MSC(k);

	  /* E state update */
	  XMX(i,p7G_E) = ESL_MAX(XMX(i,p7G_E), MMX(i,k) + esc);
	  /* in Viterbi alignments, Dk->E can't win in local mode (and
	   * isn't possible in glocal mode), so don't bother
	   * looking. */

	  /* insert state */
	  sc = ESL_MAX(MMX(i-1,k) + TSC(p7P_MI,k),
		       IMX(i-1,k) + TSC(p7P_II,k));
	  IMX(i,k) = sc + ISC(k);
	  
	  /* delete state */
	  DMX(i,k) =  ESL_MAX(MMX(i,k-1) + TSC(p7P_MD,k-1),
			      DMX(i,k-1) + TSC(p7P_DD,k-1));
	}

      /* Unrolled match state M. */
      sc       = ESL_MAX(    MMX(i-1,M-1)   + TSC(p7P_MM,M-1),
			     IMX(i-1,M-1)   + TSC(p7P_IM,M-1));
      sc       = ESL_MAX(sc, DMX(i-1,M-1 )  + TSC(p7P_DM,M-1));
      sc       = ESL_MAX(sc, XMX(i-1,p7G_B) + TSC(p7P_BM,M-1));
      MMX(i,M) = sc + MSC(M);
      
      /* Unrolled delete state D_M 
       * (Unlike internal Dk->E transitions that can never appear in 
       * Viterbi alignments, D_M->E is possible in glocal mode.)
       */
      DMX(i,M) = ESL_MAX(MMX(i,M-1) + TSC(p7P_MD,M-1),
			 DMX(i,M-1) + TSC(p7P_DD,M-1));

      /* E state update; transition from M_M scores 0 by def'n */
      sc  =          ESL_MAX(XMX(i,p7G_E), MMX(i,M));
      XMX(i,p7G_E) = ESL_MAX(sc,           DMX(i,M));
   
      /* Now the special states. E must already be done, and B must follow N,J.
       * remember, N, C and J emissions are zero score by definition.
       */
      /* J state */
      sc           =             XMX(i-1,p7G_J) + gm->xsc[p7P_J][p7P_LOOP];   /* J->J */
      XMX(i,p7G_J) = ESL_MAX(sc, XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_LOOP]);  /* E->J is E's "loop" */
      
      /* C state */
      sc           =             XMX(i-1,p7G_C) + gm->xsc[p7P_C][p7P_LOOP];
      XMX(i,p7G_C) = ESL_MAX(sc, XMX(i,  p7G_E) + gm->xsc[p7P_E][p7P_MOVE]);
      
      /* N state */
      XMX(i,p7G_N) = XMX(i-1,p7G_N) + gm->xsc[p7P_N][p7P_LOOP];
      
      /* B state */
      sc           =             XMX(i,p7G_N) + gm->xsc[p7P_N][p7P_MOVE];   /* N->B is N's move */
      XMX(i,p7G_B) = ESL_MAX(sc, XMX(i,p7G_J) + gm->xsc[p7P_J][p7P_MOVE]);  /* J->B is J's move */
    }
  
  /* T state (not stored) */
  if (opt_sc != NULL) *opt_sc = XMX(L,p7G_C) + gm->xsc[p7P_C][p7P_MOVE];
  gx->M = gm->M;
  gx->L = L;
  return eslOK;
}
Ejemplo n.º 28
0
int
main(int argc, char **argv)
{
  ESL_GETOPTS    *go        = NULL;
  char           *seqfile   = NULL;
  ESL_SQFILE     *sqfp      = NULL;
  int             infmt     = eslSQFILE_UNKNOWN;
  int             alphatype = eslUNKNOWN;
  ESL_ALPHABET   *abc       = NULL;
  ESL_SQ         *sq        = NULL;
  int64_t         nseq      = 0;   
  int64_t         nres      = 0;
  int64_t         small     = 0;
  int64_t         large     = 0;
  double         *monoc     = NULL; /* monoresidue composition per sequence  */
  double         *monoc_all = NULL; /* monoresidue composition over all seqs */
  int             do_comp   = FALSE;
  int             status    = eslOK;
  int             wstatus;
  int             i;
  int             x;

  /* Parse command line */
  go = esl_getopts_Create(options);
  if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
  if (esl_opt_VerifyConfig(go)               != eslOK) cmdline_failure(argv[0], "Error in app configuration: %s\n",   go->errbuf);
  if (esl_opt_GetBoolean(go, "-h") )                   cmdline_help(argv[0], go);
  if (esl_opt_ArgNumber(go) != 1)                      cmdline_failure(argv[0], "Incorrect number of command line arguments.\n");

  seqfile = esl_opt_GetArg(go, 1);
  do_comp = esl_opt_GetBoolean(go, "-c");

  if (esl_opt_GetString(go, "--informat") != NULL) {
    infmt = esl_sqio_FormatCode(esl_opt_GetString(go, "--informat"));
    if (infmt == eslSQFILE_UNKNOWN) esl_fatal("%s is not a valid input sequence file format for --informat"); 
  }

  status = esl_sqfile_Open(seqfile, infmt, NULL, &sqfp);
  if      (status == eslENOTFOUND) esl_fatal("No such file %s", seqfile);
  else if (status == eslEFORMAT)   esl_fatal("Format of seqfile %s unrecognized.", seqfile);
  else if (status != eslOK)        esl_fatal("Open failed, code %d.", status);

  if      (esl_opt_GetBoolean(go, "--rna"))   alphatype = eslRNA;
  else if (esl_opt_GetBoolean(go, "--dna"))   alphatype = eslDNA;
  else if (esl_opt_GetBoolean(go, "--amino")) alphatype = eslAMINO;
  else {
    status = esl_sqfile_GuessAlphabet(sqfp, &alphatype);
    if      (status == eslEAMBIGUOUS) esl_fatal("Couldn't guess alphabet from first sequence in %s", seqfile);
    else if (status == eslEFORMAT)    esl_fatal("Sequence file parse error, line %d of file %s:\n%s\n",
					       sqfp->linenumber, seqfile, sqfp->errbuf);
    else if (status == eslENODATA)    esl_fatal("Sequence file %s contains no data?", seqfile);
    else if (status != eslOK)         esl_fatal("Failed to guess alphabet (error code %d)\n", status);
  }
  abc = esl_alphabet_Create(alphatype);
  sq  = esl_sq_CreateDigital(abc);
  esl_sqfile_SetDigital(sqfp, abc);

  if (do_comp) {
    ESL_ALLOC(monoc,     (abc->Kp) * sizeof(double));  
    ESL_ALLOC(monoc_all, (abc->Kp) * sizeof(double));  
    esl_vec_DSet(monoc_all, abc->Kp, 0.0);
    esl_vec_DSet(monoc,     abc->Kp, 0.0);
  }

  while ((wstatus = esl_sqio_ReadWindow(sqfp, 0, 4096, sq)) != eslEOF)
    {
      if (wstatus == eslOK)
	{
	  if (do_comp) 
	    for (i = 1; i <= sq->n; i++) 
	      monoc[sq->dsq[i]]++;
	}
      else if (wstatus == eslEOD) 
	{			
	  if (nseq == 0) { small = large = sq->L; }
	  else {
	    small = ESL_MIN(small, sq->L);
	    large = ESL_MAX(large, sq->L);
	  }

	  if (esl_opt_GetBoolean(go, "-a")) {
	    printf("= %-20s %8" PRId64 " %s\n", sq->name, sq->L, (sq->desc != NULL) ? sq->desc : "");
	  }

	  nres += sq->L;
	  nseq++;
	  esl_sq_Reuse(sq);
	  if (do_comp) {
	    esl_vec_DAdd(monoc_all, monoc, abc->Kp);
	    esl_vec_DSet(monoc, abc->Kp, 0.0);
	  }
	}
      else if (wstatus == eslEFORMAT)
	{
	  esl_fatal("Failed to parse sequence at line %ld, file %s:\n%s", 
		    (long) sqfp->linenumber, sqfp->filename, sqfp->errbuf);
	}
      else 
	esl_fatal("Failed in reading sequence:\n%s\n", sqfp->errbuf);
    }

  printf("Format:              %s\n",   esl_sqio_DescribeFormat(sqfp->format));
  printf("Alphabet type:       %s\n",   esl_abc_DescribeType(abc->type));
  printf("Number of sequences: %" PRId64 "\n", nseq);
  printf("Total # residues:    %" PRId64 "\n", nres);
  printf("Smallest:            %" PRId64 "\n", small);
  printf("Largest:             %" PRId64 "\n", large);
  printf("Average length:      %.1f\n", (float) nres / (float) nseq);

  if (do_comp) {
    printf("\nResidue composition:\n");
    for (x = 0; x < abc->Kp; x++)
      if (x < abc->K || monoc_all[x] > 0)
	printf("residue: %c   %10.0f  %.4f\n", abc->sym[x], monoc_all[x], monoc_all[x] / (double) nres);
    free(monoc);
    free(monoc_all);
  }

  esl_alphabet_Destroy(abc);
  esl_sq_Destroy(sq);
  esl_sqfile_Close(sqfp);
  esl_getopts_Destroy(go);
  return 0;

 ERROR:
  return status;
}
Ejemplo n.º 29
0
/* Function:  p7_ViterbiFilter()
 * Synopsis:  Calculates Viterbi score, vewy vewy fast, in limited precision.
 * Incept:    SRE, Tue Nov 27 09:15:24 2007 [Janelia]
 *
 * Purpose:   Calculates an approximation of the Viterbi score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and a preallocated one-row DP matrix <ox>. Return the 
 *            estimated Viterbi score (in nats) in <ret_sc>.
 *            
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow. 
 *            
 *            The model must be in a local alignment mode; other modes
 *            cannot provide the necessary guarantee of no underflow.
 *            
 *            This is a striped SIMD Viterbi implementation using Intel
 *            VMX integer intrinsics \citep{Farrar07}, in reduced
 *            precision (signed words, 16 bits).
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - DP matrix
 *            ret_sc  - RETURN: Viterbi score (in nats)          
 *
 * Returns:   <eslOK> on success;
 *            <eslERANGE> if the score overflows; in this case
 *            <*ret_sc> is <eslINFINITY>, and the sequence can 
 *            be treated as a high-scoring hit.
 *
 * Throws:    <eslEINVAL> if <ox> allocation is too small, or if
 *            profile isn't in a local alignment mode. (Must be in local
 *            alignment mode because that's what helps us guarantee 
 *            limited dynamic range.)
 *
 * Xref:      [Farrar07] for ideas behind striped SIMD DP.
 *            J2/46-47 for layout of HMMER's striped SIMD DP.
 *            J2/50 for single row DP.
 *            J2/60 for reduced precision (epu8)
 *            J2/65 for initial benchmarking
 *            J2/66 for precision maximization
 *            J4/138-140 for reimplementation in 16-bit precision
 */
int
p7_ViterbiFilter(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *ret_sc)
{
  vector signed short mpv, dpv, ipv; /* previous row values                                       */
  vector signed short sv;	     /* temp storage of 1 curr row value in progress              */
  vector signed short dcv;	     /* delayed storage of D(i,q+1)                               */
  vector signed short xEv;	     /* E state: keeps max for Mk->E as we go                     */
  vector signed short xBv;	     /* B state: splatted vector of B[i-1] for B->Mk calculations */
  vector signed short Dmaxv;         /* keeps track of maximum D cell on row                      */
  int16_t  xE, xB, xC, xJ, xN;	     /* special states' scores                                    */
  int16_t  Dmax;		     /* maximum D cell score on row                               */
  int i;			     /* counter over sequence positions 1..L                      */
  int q;			     /* counter over vectors 0..nq-1                              */
  int Q;                             /* segment length: # of vectors                              */
  vector signed short *dp;           /* using {MDI}MX(q) macro requires initialization of <dp>    */
  vector signed short *rsc;	     /* will point at om->ru[x] for residue x[i]                  */
  vector signed short *tsc;	     /* will point into (and step thru) om->tu                    */

  vector signed short negInfv;

  Q = p7O_NQW(om->M);
  dp = ox->dpw[0];

  /* Check that the DP matrix is ok for us. */
  if (Q > ox->allocQ8)                                 ESL_EXCEPTION(eslEINVAL, "DP matrix allocated too small");
  if (om->mode != p7_LOCAL && om->mode != p7_UNILOCAL) ESL_EXCEPTION(eslEINVAL, "Fast filter only works for local alignment");
  ox->M   = om->M;

  negInfv = esl_vmx_set_s16((signed short)-32768);
  
  /* Initialization. In unsigned arithmetic, -infinity is -32768
   */
  for (q = 0; q < Q; q++)
    MMXo(q) = IMXo(q) = DMXo(q) = negInfv;
  xN   = om->base_w;
  xB   = xN + om->xw[p7O_N][p7O_MOVE];
  xJ   = -32768;
  xC   = -32768;
  xE   = -32768;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpVFRow(ox, 0, xE, 0, xJ, xB, xC); /* first 0 is <rowi>: do header. second 0 is xN: always 0 here. */
#endif

  for (i = 1; i <= L; i++)
    {
      rsc   = om->rwv[dsq[i]];
      tsc   = om->twv;
      dcv   = negInfv;               /* "-infinity" */
      xEv   = negInfv;
      Dmaxv = negInfv;
      xBv   = esl_vmx_set_s16(xB);

      /* Right shifts by 1 value (2 bytes). 4,8,12,x becomes x,4,8,12. 
       * Because ia32 is littlendian, this means a left bit shift.
       * Zeros shift on automatically; replace it with -32768.
       */
      mpv = MMXo(Q-1);  mpv = vec_sld(negInfv, mpv, 14);
      dpv = DMXo(Q-1);  dpv = vec_sld(negInfv, dpv, 14);
      ipv = IMXo(Q-1);  ipv = vec_sld(negInfv, ipv, 14);

      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
	  sv   =              vec_adds(xBv, *tsc);  tsc++;
	  sv   = vec_max (sv, vec_adds(mpv, *tsc)); tsc++;
	  sv   = vec_max (sv, vec_adds(ipv, *tsc)); tsc++;
	  sv   = vec_max (sv, vec_adds(dpv, *tsc)); tsc++;
	  sv   = vec_adds(sv, *rsc);                rsc++;
	  xEv  = vec_max(xEv, sv);
	  
	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMXo(q);
	  dpv = DMXo(q);
	  ipv = IMXo(q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMXo(q) = sv;
	  DMXo(q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
	   */
	  dcv   = vec_adds(sv, *tsc);  tsc++;
	  Dmaxv = vec_max(dcv, Dmaxv);

	  /* Calculate and store I(i,q) */
	  sv     =             vec_adds(mpv, *tsc);  tsc++;
	  IMXo(q)= vec_max(sv, vec_adds(ipv, *tsc)); tsc++;
	}	  

      /* Now the "special" states, which start from Mk->E (->C, ->J->B) */
      xE = esl_vmx_hmax_s16(xEv);
      if (xE >= 32767) { *ret_sc = eslINFINITY; return eslERANGE; }	/* immediately detect overflow */
      xN = xN + om->xw[p7O_N][p7O_LOOP];
      xC = ESL_MAX(xC + om->xw[p7O_C][p7O_LOOP], xE + om->xw[p7O_E][p7O_MOVE]);
      xJ = ESL_MAX(xJ + om->xw[p7O_J][p7O_LOOP], xE + om->xw[p7O_E][p7O_LOOP]);
      xB = ESL_MAX(xJ + om->xw[p7O_J][p7O_MOVE], xN + om->xw[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Finally the "lazy F" loop (sensu [Farrar07]). We can often
       * prove that we don't need to evaluate any D->D paths at all.
       *
       * The observation is that if we can show that on the next row,
       * B->M(i+1,k) paths always dominate M->D->...->D->M(i+1,k) paths
       * for all k, then we don't need any D->D calculations.
       * 
       * The test condition is:
       *      max_k D(i,k) + max_k ( TDD(k-2) + TDM(k-1) - TBM(k) ) < xB(i)
       * So:
       *   max_k (TDD(k-2) + TDM(k-1) - TBM(k)) is precalc'ed in om->dd_bound;
       *   max_k D(i,k) is why we tracked Dmaxv;
       *   xB(i) was just calculated above.
       */
      Dmax = esl_vmx_hmax_s16(Dmaxv);
      if (Dmax + om->ddbound_w > xB) 
	{
	  /* Now we're obligated to do at least one complete DD path to be sure. */
	  /* dcv has carried through from end of q loop above */
	  dcv = vec_sld(negInfv, dcv, 14); 
	  tsc = om->twv + 7*Q;	/* set tsc to start of the DD's */
	  for (q = 0; q < Q; q++) 
	    {
	      DMXo(q) = vec_max(dcv, DMXo(q));	
	      dcv     = vec_adds(DMXo(q), *tsc); tsc++;
	    }

	  /* We may have to do up to three more passes; the check
	   * is for whether crossing a segment boundary can improve
	   * our score. 
	   */
	  do {
	    dcv = vec_sld(negInfv, dcv, 14); 
	    tsc = om->twv + 7*Q;	/* set tsc to start of the DD's */
	    for (q = 0; q < Q; q++) 
	      {
		if (! vec_any_gt(dcv, DMXo(q))) break;
		DMXo(q) = vec_max(dcv, DMXo(q));	
		dcv     = vec_adds(DMXo(q), *tsc);   tsc++;
	      }	    
	  } while (q == Q);
	}
      else  /* not calculating DD? then just store the last M->D vector calc'ed.*/
	DMXo(0) = vec_sld(negInfv, dcv, 14);
	  
#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpVFRow(ox, i, xE, 0, xJ, xB, xC);   
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T */
  if (xC > -32768) 
    {
      *ret_sc = (float) xC + (float) om->xw[p7O_C][p7O_MOVE] - (float) om->base_w;
      /* *ret_sc += L * om->ncj_roundoff;  see J4/150 for rationale: superceded by -3.0nat approximation*/
      *ret_sc /= om->scale_w;
      *ret_sc -= 3.0; /* the NN/CC/JJ=0,-3nat approximation: see J5/36. That's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ contrib */
    }
  else *ret_sc = -eslINFINITY;
  return eslOK;
}
Ejemplo n.º 30
0
/* Function: p7_null3_score()
 *
 * Purpose:  Calculate a correction (in log_2 odds) to be applied
 *           to a sequence, using a null model based on the
 *           composition of the target sequence.
 *           The null model is constructed /post hoc/ as the
 *           distribution of the target sequence; if the target
 *           sequence is 40% A, 5% C, 5% G, 40% T, then the null
 *           model is (0.4, 0.05, 0.05, 0.4). This function is
 *           based heavily on Infernal's ScoreCorrectionNull3(),
 *           with two important changes:
 *            - it leaves the log2 conversion from NATS to BITS
 *              for the calling function.
 *            - it doesn't include the omega score modifier
 *              (based on prior probability of using the null3
 *              model), again leaving this to the calling function.
 *
 * Args:     abc   - alphabet for hit (only used to get alphabet size)
 *           dsq   - the sequence the hit resides in
 *           tr   - trace of the alignment, used to find the match states
 *                  (non-match chars are ignored in computing freq, not used if NULL)
 *           start - start position of hit in dsq
 *           stop  - end  position of hit in dsq
 *           bg    - background, used for the default null model's emission freq
 *           ret_sc - RETURN: the correction to the score (in NATS);
 *                   caller subtracts this from hit score to get
 *                   corrected score.
 * Return:   void, ret_sc: the log-odds score correction (in NATS).
 */
void
p7_null3_score(const ESL_ALPHABET *abc, const ESL_DSQ *dsq, P7_TRACE *tr, int start, int stop, P7_BG *bg, float *ret_sc)
{
    float score = 0.;
    int status;
    int i;
    float *freq;
    int dir;
    int tr_pos;

    ESL_ALLOC(freq, sizeof(float) * abc->K);
    esl_vec_FSet(freq, abc->K, 0.0);

    /* contract check */
    if(abc == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() alphabet is NULL.%s\n", "");
    if(dsq == NULL) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() dsq alphabet is NULL.%s\n", "");
    if(abc->type != eslRNA && abc->type != eslDNA) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() expects alphabet of RNA or DNA.%s\n", "");

    dir = start < stop ? 1 : -1;

    if (tr != NULL) {
        /* skip the parts of the trace that precede the first match state */
        tr_pos = 2;
        i = start;
        while (tr->st[tr_pos] != p7T_M) {
            if (tr->st[tr_pos] == p7T_N)
                i += dir;
            tr_pos++;
        }

        /* tally frequencies from characters hitting match state*/
        while (tr->st[tr_pos] != p7T_E) {
            if (tr->st[tr_pos] == p7T_M) {
                if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", "");
                esl_abc_FCount(abc, freq, dsq[i], 1.);
            }
            if (tr->st[tr_pos] != p7T_D )
                i += dir;
            tr_pos++;
        }
    } else {
        /* tally frequencies from the full envelope */
        for (i=ESL_MIN(start,stop); i <= ESL_MAX(start,stop); i++)
        {
            if(esl_abc_XIsGap(abc, dsq[i])) esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "in p7_null3_score(), res %d is a gap!%s\n", "");
            esl_abc_FCount(abc, freq, dsq[i], 1.);
        }
    }

    esl_vec_FNorm(freq, abc->K);


    /* now compute score modifier (nats) - note: even with tr!=NULL, this includes the unmatched characters*/
    for (i = 0; i < abc->K; i++)
        score += freq[i]==0 ? 0.0 : esl_logf( freq[i]/bg->f[i] ) * freq[i] * ( (stop-start)*dir +1) ;

    /* Return the correction to the bit score. */
    score = p7_FLogsum(0., score);
    *ret_sc = score;

    return;

ERROR:
    esl_exception(eslEINVAL, FALSE, __FILE__, __LINE__, "p7_null3_score() memory allocation error.%s\n", "");
    return; /* never reached */

}