Ejemplo n.º 1
0
/* Function: MakeSearchHMM()
 * 
 * Purpose:  Convert an HMM (probability form) to integer log-odds 
 *           form, for searching/alignment algorithms.
 *           
 * Args:     hmm       - probability-form hmm to convert
 *           randomseq - random sequence model
 *           shmm      - integer log-odds search HMM to create (pre-allocated)
 *
 * Return:   (void)
 */
void
MakeSearchHMM(struct hmm_struc *hmm, float *randomseq, struct shmm_s *shmm)
{
  int              k, sym, x;
  float            tmp;

  /* Symbol emission probabilities.
   * The search model keeps vectors for all 26 letters, for speedy lookup.
   */
  for (sym = 'A'; sym <= 'Z'; sym++)
    {
      if (strchr(Alphabet, sym) != NULL)
	{
	  x = SYMIDX(sym);
	  for (k = 0; k <= hmm->M; k++)
	    {
	      tmp = LOG2(hmm->mat[k].p[x] / randomseq[x]);
	      shmm->m_emit[sym-'A'][k] = (int) (INTSCALE * tmp);

				/* Inserts are HARDWIRED to give zero score!!! */
	      shmm->i_emit[sym-'A'][k] = 0;
#ifdef SRE_REMOVED
	      tmp = LOG2(hmm->ins[k].p[x] / prior->p[INSERT][x]);
	      shmm->i_emit[sym-'A'][k] = (int) (INTSCALE * tmp);
#endif
	    }
	}
      else		/* degenerate symbols */
	{
	  for (k = 0; k <= hmm->M; k++)
	    {
	      shmm->m_emit[sym-'A'][k] = Symscore(sym, hmm->mat[k].p, randomseq);
	      shmm->i_emit[sym-'A'][k] = Symscore(sym, hmm->ins[k].p, randomseq);
	    }
	}
    }

  /* State transition probabilities
   */
  for (k = 0; k <= hmm->M; k++)
    {
      tmp = LOG2(hmm->del[k].t[DELETE]); shmm->t[k*9 + Tdd] = (int) (INTSCALE * tmp); 
      tmp = LOG2(hmm->del[k].t[INSERT]); shmm->t[k*9 + Tdi] = (int) (INTSCALE * tmp); 
      tmp = LOG2(hmm->del[k].t[MATCH]);  shmm->t[k*9 + Tdm] = (int) (INTSCALE * tmp); 

      tmp = LOG2(hmm->ins[k].t[DELETE]); shmm->t[k*9 + Tid] = (int) (INTSCALE * tmp); 
      tmp = LOG2(hmm->ins[k].t[INSERT]); shmm->t[k*9 + Tii] = (int) (INTSCALE * tmp); 
      tmp = LOG2(hmm->ins[k].t[MATCH]);  shmm->t[k*9 + Tim] = (int) (INTSCALE * tmp); 

      tmp = LOG2(hmm->mat[k].t[DELETE]); shmm->t[k*9 + Tmd] = (int) (INTSCALE * tmp); 
      tmp = LOG2(hmm->mat[k].t[INSERT]); shmm->t[k*9 + Tmi] = (int) (INTSCALE * tmp); 
      tmp = LOG2(hmm->mat[k].t[MATCH]);  shmm->t[k*9 + Tmm] = (int) (INTSCALE * tmp); 
    }

  /* Annotation
   */
  shmm->flags = hmm->flags;
  if (hmm->flags & HMM_REF) strcpy(shmm->ref, hmm->ref);
  if (hmm->flags & HMM_CS)  strcpy(shmm->cs,  hmm->cs);
}
Ejemplo n.º 2
0
/* Function: DefaultGeneticCode()
 *
 * Purpose:  Fill aacode[] with the standard genetic code, mapping each
 *           triplet index to an amino acid index.
 *           Triplet index: AAA = 0, AAC = 1, ... UUU = 63.
 *           AA index: alphabetical: A=0,C=1... Y=19, as given by SYMIDX().
 *           Stop codon: -1.
 *           Reads the stdcode1[] global translation table from SQUID;
 *           an entry starting with '*' marks a stop codon.
 *
 * Args:     aacode  - preallocated 0..63 array for genetic code
 *
 * Return:   (void)
 */
void
DefaultGeneticCode(int *aacode)
{
  int  codon;       /* triplet index, 0..63                      */
  char residue;     /* first character of the table entry        */

  for (codon = 0; codon < 64; codon++)
    {
      residue = *(stdcode1[codon]);
      if (residue != '*')
        aacode[codon] = SYMIDX(residue);
      else
        aacode[codon] = -1;     /* stop codon */
    }
}
Ejemplo n.º 3
0
/* Function: Symscore()
 * 
 * Purpose:  Given a sequence character x and an emission probability
 *           vector, calculate the log-odds (base 2) score of the
 *           symbol for an emission scoring vector.
 *
 *           A character found in Alphabet is scored directly as
 *           LOG2(scores[i] / priors[i]). A recognized degenerate
 *           character is scored as the log ratio of the summed
 *           probabilities of the symbols it stands for. Anything else
 *           (including '\0') is scored as LOG2(1.0 / 1.0).
 *           
 * Args:     x      - the character, 'A'-'Z'
 *           scores - emission probability vector
 *           priors - prior probabilities for log-odds
 *                    
 * Return:   the integer log odds score of x given the emission
 *           vector and the priors, scaled up by INTSCALE.                   
 */
int
Symscore(char x, float *scores, float *priors)
{
  float  result;
  float  numer, denom;
  int    x_idx;

  /* Simple case: x is in the alphabet.
   * strchr() also matches the terminating NUL of Alphabet, so '\0' is
   * excluded explicitly; otherwise it would be treated as an alphabet
   * symbol and handed to SYMIDX() to index scores[] and priors[].
   */
  if (x != '\0' && strchr(Alphabet, x) != NULL) 
    {
      x_idx  = SYMIDX(x);
      result = LOG2(scores[x_idx] / priors[x_idx]);
      return (int) (INTSCALE * result);
    }

  /* Non-simple case: x is not in alphabet, but instead represents
   * an approved degenerate symbol (for instance, N for A|C|G|T).
   * Unrecognized characters share the fully-ambiguous case.
   */
  if (Alphabet_type == kAmino)
    {
      switch (x) {
      case 'B': 		/* N or D */
	numer  = scores[SYMIDX('N')] + scores[SYMIDX('D')];
	denom  = priors[SYMIDX('N')] + priors[SYMIDX('D')];
	break;
      case 'Z':			/* Q or E */
	numer  = scores[SYMIDX('Q')] + scores[SYMIDX('E')];
	denom  = priors[SYMIDX('Q')] + priors[SYMIDX('E')];
	break;
      default:
      case 'X':			/* any residue */
	numer = denom = 1.0;
	break;
      }
    }
  else if (Alphabet_type == kDNA || Alphabet_type == kRNA)
    {
      switch (x) {		/* assumes order "ACGT" */
      case 'B':			/* C, G or T */
	numer = scores[1] + scores[2] + scores[3]; 
	denom = priors[1] + priors[2] + priors[3]; 
	break;
      case 'D':			/* A, G or T */
	numer = scores[0] + scores[2] + scores[3];
	denom = priors[0] + priors[2] + priors[3];
	break;
      case 'H': 		/* A, C or T */
	numer = scores[0] + scores[1] + scores[3];
	denom = priors[0] + priors[1] + priors[3];
	break;
      case 'K': 		/* G or T */
	numer = scores[2] + scores[3];
	denom = priors[2] + priors[3];
	break;
      case 'M': 		/* A or C */
	numer = scores[0] + scores[1];
	denom = priors[0] + priors[1];
	break;
      case 'R': 		/* A or G */
	numer = scores[0] + scores[2];
	denom = priors[0] + priors[2];
	break;
      case 'S': 		/* C or G */
	numer = scores[1] + scores[2];
	denom = priors[1] + priors[2];
	break;
      case 'V': 		/* A, C or G */
	numer = scores[0] + scores[1] + scores[2];
	denom = priors[0] + priors[1] + priors[2];
	break;
      case 'T': 		/* whichever of T/U is not in the alphabet */
      case 'U':
	numer = scores[3];
	denom = priors[3];
	break;
      case 'W':			/* A or T */
	numer = scores[0] + scores[3];
	denom = priors[0] + priors[3];
	break;
      case 'Y':			/* C or T */
	numer = scores[1] + scores[3];
	denom = priors[1] + priors[3];
	break;
      default:
      case 'N':			/* any base */
	numer = denom = 1.0;
	break;
      }
    }
  else
    {
      numer = denom = 1.0;
    }

  result = LOG2(numer / denom);
  return (int) (INTSCALE * result);
}
Ejemplo n.º 4
0
/* Function: CountSymbol()
 * 
 * Given an observed symbol, and a number of counts to
 * distribute (typically just 1.0), bump the appropriate counter(s).
 * 
 * This is completely trivial only so long as the symbols
 * always come from the expected alphabet; since we also
 * have to deal with degenerate symbols for both nucleic
 * acid and protein languages, we make a function to deal
 * with this.
 *
 * A degenerate symbol splits wt evenly among the symbols it
 * stands for.
 *
 * Returns 1 on success and bumps the necessary counters.
 * Returns 0 on failure and bumps each counter evenly, as
 * if it saw a completely ambiguous symbol; this lets
 * the caller silently accept garbage symbols, if it cares to.
 * A NUL character ('\0') is treated as a garbage symbol.
 */
int
CountSymbol(char   sym,		/* observed symbol                        */
	    double wt,          /* number of counts to distribute (1.0)   */
	    float *counters)    /* array of 4 or 20 counters to increment */
{
  char *alphptr;                /* pointer to the symbol within Alphabet        */
  int   status;			/* RETURN: status; did we recognize the symbol? */
  int   i;

				/* trivial case: symbol is in alphabet.
				   strchr() also matches Alphabet's terminating
				   NUL, which would index one slot past the
				   last symbol counter, so '\0' is excluded
				   here and handled as garbage below. */
  if (sym != '\0' && (alphptr = strchr(Alphabet, sym)) != NULL)
    {
      counters[alphptr - Alphabet] += wt;
      return 1;
    }
				/* non trivial case: symbol not in alphabet;
				   either degenerate symbol, or it's garbage */
  status = 1;
  if (Alphabet_type == kAmino)
    {
      switch (sym) {
      case 'B': 		/* N or D */
	counters[SYMIDX('N')] += wt * 0.5;
	counters[SYMIDX('D')] += wt * 0.5;
	break;
      case 'Z':			/* Q or E */
	counters[SYMIDX('Q')] += wt * 0.5;
	counters[SYMIDX('E')] += wt * 0.5;
	break;
      default:
	Warn("unrecognized character %c (%d) in sequence\n", sym, (int) sym);
	status = 0;
				/* FALLTHROUGH to case 'X' */
      case 'X':
	for (i = 0; i < Alphabet_size; i++)
	  counters[i] += wt / (float) Alphabet_size;
	break;
      }
    }

  else if (Alphabet_type == kDNA || Alphabet_type == kRNA)
    {
				/* Deal with IUPAC code degeneracies. 
				   WARNING: Expects that the alphabet
				   is "ACGT" or "ACGU"; any other order 
				   will break this code! */
      switch (sym) {
      case 'B': counters[1] += wt/3.0; counters[2] += wt/3.0; counters[3] += wt/3.0; break;
      case 'D': counters[0] += wt/3.0; counters[2] += wt/3.0; counters[3] += wt/3.0; break;
      case 'H': counters[0] += wt/3.0; counters[1] += wt/3.0; counters[3] += wt/3.0; break;
      case 'K': counters[2] += wt/2.0; counters[3] += wt/2.0;                        break;
      case 'M': counters[0] += wt/2.0; counters[1] += wt/2.0;                        break;
      case 'R': counters[0] += wt/2.0; counters[2] += wt/2.0;                        break;
      case 'S': counters[1] += wt/2.0; counters[2] += wt/2.0;                        break;
      case 'T': counters[3] += wt;                                                   break;
      case 'U': counters[3] += wt;                                                   break;
      case 'V': counters[0] += wt/3.0; counters[1] += wt/3.0; counters[2] += wt/3.0; break;
      case 'W': counters[0] += wt/2.0; counters[3] += wt/2.0;                        break;
      case 'Y': counters[1] += wt/2.0; counters[3] += wt/2.0;                        break;
      default:
	Warn("unrecognized character %c (%d) in sequence\n", sym, (int) sym);
	status = 0;
				/* FALLTHROUGH to case 'N' */
      case 'N':
	for (i = 0; i < Alphabet_size; i++)
	  counters[i] += wt / (float) Alphabet_size;
	break;
      }
    }

  else
    {
				/* unknown alphabet type: nothing can be
				   recognized, so spread the counts evenly */
      status = 0;
      Warn("unrecognized character %c (%d) in sequence\n", sym, (int) sym);
      for (i = 0; i < Alphabet_size; i++)
	counters[i] += wt / (float) Alphabet_size;
    }

  return status;
}