/* Function: MakeSearchHMM() * * Purpose: Convert an HMM (probability form) to integer log-odds * form, for searching/alignment algorithms. * * Args: hmm - probability-form hmm to convert * randomseq - random sequence model * shmm - integer log-odds search HMM to create (pre-allocated) * * Return: (void) */ void MakeSearchHMM(struct hmm_struc *hmm, float *randomseq, struct shmm_s *shmm) { int k, sym, x; float tmp; /* Symbol emission probabilities. * The search model keeps vectors for all 26 letters, for speedy lookup. */ for (sym = 'A'; sym <= 'Z'; sym++) { if (strchr(Alphabet, sym) != NULL) { x = SYMIDX(sym); for (k = 0; k <= hmm->M; k++) { tmp = LOG2(hmm->mat[k].p[x] / randomseq[x]); shmm->m_emit[sym-'A'][k] = (int) (INTSCALE * tmp); /* Inserts are HARDWIRED to give zero score!!! */ shmm->i_emit[sym-'A'][k] = 0; #ifdef SRE_REMOVED tmp = LOG2(hmm->ins[k].p[x] / prior->p[INSERT][x]); shmm->i_emit[sym-'A'][k] = (int) (INTSCALE * tmp); #endif } } else /* degenerate symbols */ { for (k = 0; k <= hmm->M; k++) { shmm->m_emit[sym-'A'][k] = Symscore(sym, hmm->mat[k].p, randomseq); shmm->i_emit[sym-'A'][k] = Symscore(sym, hmm->ins[k].p, randomseq); } } } /* State transition probabilities */ for (k = 0; k <= hmm->M; k++) { tmp = LOG2(hmm->del[k].t[DELETE]); shmm->t[k*9 + Tdd] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->del[k].t[INSERT]); shmm->t[k*9 + Tdi] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->del[k].t[MATCH]); shmm->t[k*9 + Tdm] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->ins[k].t[DELETE]); shmm->t[k*9 + Tid] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->ins[k].t[INSERT]); shmm->t[k*9 + Tii] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->ins[k].t[MATCH]); shmm->t[k*9 + Tim] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->mat[k].t[DELETE]); shmm->t[k*9 + Tmd] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->mat[k].t[INSERT]); shmm->t[k*9 + Tmi] = (int) (INTSCALE * tmp); tmp = LOG2(hmm->mat[k].t[MATCH]); shmm->t[k*9 + Tmm] = (int) (INTSCALE * tmp); } /* Annotation */ shmm->flags = hmm->flags; if 
(hmm->flags & HMM_REF) strcpy(shmm->ref, hmm->ref); if (hmm->flags & HMM_CS) strcpy(shmm->cs, hmm->cs); }
/* Function: DefaultGeneticCode()
 *
 * Purpose:  Fill aacode with the mapping from codon index to amino acid
 *           index, taken from the stdcode1[] global translation table
 *           from SQUID.
 *           Triplet index: AAA = 0, AAC = 1, ... UUU = 63.
 *           AA index:      alphabetical: A=0, C=1, ... Y=19.
 *           Stop codon:    -1.
 *
 * Args:     aacode - preallocated 0..63 array that receives the code
 *
 * Return:   (void)
 */
void
DefaultGeneticCode(int *aacode)
{
  int codon;

  for (codon = 0; codon < 64; codon++)
    {
      /* The first character of a table entry is the one-letter residue
       * code; '*' marks a stop codon.
       */
      if (*(stdcode1[codon]) != '*')
        {
          aacode[codon] = SYMIDX(*(stdcode1[codon]));
          continue;
        }
      aacode[codon] = -1;
    }
}
/* Function: Symscore() * * Purpose: Given a sequence character x and an hmm containing * probabilities, calculate the log-odds (base 2) score of * the symbol for an emission scoring vector. * * Args: x - the character, 'A'-'Z' * scores - emission probability vector * priors - prior probabilities for log-odds * * Return: the integer log odds score of x given the emission * vector and the priors, scaled up by INTSCALE. */ int Symscore(char x, float *scores, float *priors) { float result; float numer, denom; int x_idx; /* simple case: x is in the alphabet */ if (strchr(Alphabet, x) != NULL) { x_idx = SYMIDX(x); result = LOG2(scores[x_idx] / priors[x_idx]); return (int) (INTSCALE * result); } /* non-simple case: x is not in alphabet, but instead represents * an approved degenerate symbol (for instance, N for A|C|G|T. */ if (Alphabet_type == kAmino) { switch (x) { case 'B': numer = scores[SYMIDX('N')] + scores[SYMIDX('D')]; denom = priors[SYMIDX('N')] + priors[SYMIDX('D')]; break; case 'Z': numer = scores[SYMIDX('Q')] + scores[SYMIDX('E')]; denom = priors[SYMIDX('Q')] + priors[SYMIDX('E')]; break; default: case 'X': numer = denom = 1.0; break; } } else if (Alphabet_type == kDNA || Alphabet_type == kRNA) { switch (x) { /* assumes order "ACGT" */ case 'B': numer = scores[1] + scores[2] + scores[3]; denom = priors[1] + priors[2] + priors[3]; break; case 'D': numer = scores[0] + scores[2] + scores[3]; denom = priors[0] + priors[2] + priors[3]; break; case 'H': numer = scores[0] + scores[1] + scores[3]; denom = priors[0] + priors[1] + priors[3]; break; case 'K': numer = scores[2] + scores[3]; denom = priors[2] + priors[3]; break; case 'M': numer = scores[0] + scores[1]; denom = priors[0] + priors[1]; break; case 'R': numer = scores[0] + scores[2]; denom = priors[0] + priors[2]; break; case 'S': numer = scores[1] + scores[2]; denom = priors[1] + priors[2]; break; case 'V': numer = scores[0] + scores[1] + scores[2]; denom = priors[0] + priors[1] + priors[2]; break; case 'T': 
case 'U': numer = scores[3]; denom = priors[3]; break; case 'W': numer = scores[0] + scores[3]; denom = priors[0] + priors[3]; break; case 'Y': numer = scores[1] + scores[3]; denom = priors[1] + priors[3]; break; default: case 'N': numer = denom = 1.0; break; } } else { numer = denom = 1.0; } result = LOG2(numer / denom); return (INTSCALE * result); }
/* Function: CountSymbol() * * Given an observed symbol, and a number of counts to * distribute (typically just 1.0), bump the appropriate counter(s). * * This is completely trivial only so long as the symbols * always come from the expected alphabet; since we also * have to deal with degenerate symbols for both nucleic * acid and protein languages, we make a function to deal * with this. * * Returns 1 on success and bumps the necessary counters. * Returns 0 on failure and bumps each counter evenly, as * if it saw a completely ambiguous symbol; this lets * the caller silently accept garbage symbols, if it cares to. */ int CountSymbol(char sym, /* observed symbol */ double wt, /* number of counts to distribute (1.0) */ float *counters) /* array of 4 or 20 counters to increment */ { char *alphptr; /* pointer into symbol in hmm->alphabet */ int status; /* RETURN: status; did we recognize the symbol? */ int i; /* trivial case: symbol is in alphabet */ if ((alphptr = strchr(Alphabet, sym)) != NULL) { counters[alphptr - Alphabet] += wt; return 1; } /* non trivial case: symbol not in alphabet; either degenerate symbol, or it's garbage */ status = 1; if (Alphabet_type == kAmino) { switch (sym) { case 'B': counters[SYMIDX('N')] += wt * 0.5; counters[SYMIDX('D')] += wt * 0.5; break; case 'Z': counters[SYMIDX('Q')] += wt * 0.5; counters[SYMIDX('E')] += wt * 0.5; break; default: Warn("unrecognized character %c (%d) in sequence\n", sym, (int) sym); status = 0; /* break thru to case 'X' */ case 'X': for (i = 0; i < Alphabet_size; i++) counters[i] += wt / (float) Alphabet_size; break; } } else if (Alphabet_type == kDNA || Alphabet_type == kRNA) { /* Deal with IUPAC code degeneracies. WARNING: Expects that the alphabet is "ACGT" or "ACGU"; any other order will break this code! 
*/ switch (sym) { case 'B': counters[1] += wt/3.0; counters[2] += wt/3.0; counters[3] += wt/3.0; break; case 'D': counters[0] += wt/3.0; counters[2] += wt/3.0; counters[3] += wt/3.0; break; case 'H': counters[0] += wt/3.0; counters[1] += wt/3.0; counters[3] += wt/3.0; break; case 'K': counters[2] += wt/2.0; counters[3] += wt/2.0; break; case 'M': counters[0] += wt/2.0; counters[1] += wt/2.0; break; case 'R': counters[0] += wt/2.0; counters[2] += wt/2.0; break; case 'S': counters[1] += wt/2.0; counters[2] += wt/2.0; break; case 'T': counters[3] += wt; break; case 'U': counters[3] += wt; break; case 'V': counters[0] += wt/3.0; counters[1] += wt/3.0; counters[2] += wt/3.0; break; case 'W': counters[0] += wt/2.0; counters[3] += wt/2.0; break; case 'Y': counters[1] += wt/2.0; counters[3] += wt/2.0; break; default: Warn("unrecognized character %c (%d) in sequence\n", sym, (int) sym); status = 0; /* break thru to case 'N' */ case 'N': for (i = 0; i < Alphabet_size; i++) counters[i] += wt / (float) Alphabet_size; break; } } else { status = 0; Warn("unrecognized character %c (%d) in sequence\n", sym, (int) sym); for (i = 0; i < Alphabet_size; i++) counters[i] += wt / (float) Alphabet_size; } return status; }