/* Function: EmitSequence() * Date: SRE, Sun Mar 8 12:28:03 1998 [St. Louis] * * Purpose: Given a model, sample a sequence and/or traceback. * * Args: hmm - the model * ret_dsq - RETURN: generated digitized sequence (pass NULL if unwanted) * ret_L - RETURN: length of generated sequence * ret_tr - RETURN: generated trace (pass NULL if unwanted) * * Returns: void */ void EmitSequence(struct plan7_s *hmm, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr) { struct p7trace_s *tr; enum p7stype type; /* current state type */ int k; /* current node index */ char *dsq; /* generated sequence, digitized */ int L; /* length of sequence */ int alloc_tlen; /* allocated space for traceback */ int alloc_L; /* allocated space for sequence */ int tpos; /* position in traceback */ int sym; /* a generated symbol index */ float t[4]; /* little array for choosing M transition from */ /* Initialize; allocations */ P7AllocTrace(64, &tr); alloc_tlen = 64; dsq = MallocOrDie(sizeof(char) * 64); alloc_L = 64; TraceSet(tr, 0, STS, 0, 0); TraceSet(tr, 1, STN, 0, 0); dsq[0] = (char) Alphabet_iupac; L = 1; k = 0; type = STN; tpos = 2; while (type != STT) { /* Deal with state transition */ switch (type) { case STB: type = STM; k = FChoose(hmm->begin+1, hmm->M) + 1; break; case STI: type = (FChoose(hmm->t[k]+TIM, 2) == 0) ? STM : STI; if (type == STM) k++; break; case STN: type = (FChoose(hmm->xt[XTN], 2) == LOOP) ? STN : STB; k = 0; break; case STE: type = (FChoose(hmm->xt[XTE], 2) == LOOP) ? STJ : STC; k = 0; break; case STC: type = (FChoose(hmm->xt[XTC], 2) == LOOP) ? STC : STT; k = 0; break; case STJ: type = (FChoose(hmm->xt[XTJ], 2) == LOOP) ? STJ : STB; k = 0; break; case STD: if (k < hmm->M) { type = (FChoose(hmm->t[k]+TDM, 2) == 0) ? STM : STD; k++; } else { type = STE; k = 0; } break; case STM: if (k < hmm->M) { FCopy(t, hmm->t[k], 3); t[3] = hmm->end[k]; switch (FChoose(t,4)) { case 0: k++; type = STM; break; case 1: type = STI; break; case 2: k++; type = STD; break; case 3: k=0; type = STE; break; default: Die("never happens"); } } else { k = 0; type = STE; } break; case STT: case STBOGUS: default: Die("can't happen."); } /* Choose a symbol emission, if necessary */ sym = -1; if (type == STM) sym = FChoose(hmm->mat[k], Alphabet_size); else if (type == STI) sym = FChoose(hmm->ins[k], Alphabet_size); else if ((type == STN && tr->statetype[tpos-1] == STN) || (type == STC && tr->statetype[tpos-1] == STC) || (type == STJ && tr->statetype[tpos-1] == STJ)) sym = FChoose(hmm->null, Alphabet_size); /* Add to the traceback; deal with realloc if necessary */ TraceSet(tr, tpos, type, k, (sym != -1) ? L : 0); tpos++; if (tpos == alloc_tlen) { alloc_tlen += 64; P7ReallocTrace(tr, alloc_tlen); } /* Add to the digitized seq; deal with realloc, if necessary */ if (sym != -1) { dsq[L] = (char) sym; L++; if (L+1 == alloc_L) { /* L+1 leaves room for sentinel byte + \0 */ alloc_L += 64; dsq = ReallocOrDie(dsq, sizeof(char) * alloc_L); } } } /* Finish off the trace */ tr->tlen = tpos; /* Finish off the dsq with sentinel byte and null terminator. * Emitted Sequence length is L-1. */ dsq[L] = (char) Alphabet_iupac; dsq[L+1] = '\0'; L--; /* Return */ if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq); if (ret_L != NULL) *ret_L = L; if (ret_tr != NULL) *ret_tr = tr; else P7FreeTrace(tr); return; }
/* Function: EmitConsensusSequence() * Date: SRE, Wed Nov 11 11:08:59 1998 [St. Louis] * * Purpose: Generate a "consensus sequence". For the purposes * of a profile HMM, this is defined as: * - for each node: * - if StateOccupancy() says that M is used * with probability >= 0.5, this M is "consensus". * Then, choose maximally likely residue. * if P>0.5 (protein) or P>0.9 (DNA), make * it upper case; else make it lower case. * - if StateOccupancy() says that I * is used with P >= 0.5, this I is "consensus"; * use it 1/(1-TII) times (its expectation value). * Generate an "x" from each I. * * The function expects that the model is config'ed * by Plan7NakedConfig(): that is, for a single global pass * with no N,C,J involvement. * * * Args: hmm - the model * ret_seq - RETURN: consensus sequence (pass NULL if unwanted) * ret_dsq - RETURN: digitized consensus sequence (pass NULL if unwanted) * ret_L - RETURN: length of generated sequence * ret_tr - RETURN: generated trace (pass NULL if unwanted) * * Returns: void */ void EmitConsensusSequence(struct plan7_s *hmm, char **ret_seq, char **ret_dsq, int *ret_L, struct p7trace_s **ret_tr) { struct p7trace_s *tr; /* RETURN: traceback */ char *dsq, *seq; /* sequence in digitized and undigitized form */ float *mp, *ip, *dp; /* state occupancies from StateOccupancy() */ int nmat, ndel, nins; /* number of matches, deletes, inserts used */ int k; /* counter for nodes */ int tpos; /* position in trace */ int i; /* position in seq (equiv pos in dsq is i+1 */ int x; /* symbol choice (M) or # symbols (I) */ float mthresh; /* >= this, show symbol as upper case */ if (Alphabet_type == hmmAMINO) mthresh = 0.5; else mthresh = 0.9; StateOccupancy(hmm, &mp, &ip, &dp); /* First pass: how many states do we need in the trace? * how long will the sequence be? */ nmat = ndel = nins = 0; for (k = 1; k <= hmm->M; k++) { if (mp[k] >= 0.5) nmat++; else ndel++; if (k < hmm->M && ip[k] >= 0.5) nins += (int) (1.f / (1.f - hmm->t[k][TII])); } /* Allocations */ P7AllocTrace(6 + nmat + ndel + nins, &tr); dsq = MallocOrDie(sizeof(char) * (nmat+nins+3)); seq = MallocOrDie(sizeof(char) * (nmat+nins+1)); /* Main pass. * Construct consensus trace, seq, and dsq. */ TraceSet(tr, 0, STS, 0, 0); TraceSet(tr, 1, STN, 0, 0); TraceSet(tr, 2, STB, 0, 0); dsq[0] = Alphabet_iupac; /* guard byte */ tpos = 3; i = 0; for (k = 1; k <= hmm->M; k++) { if (mp[k] >= 0.5) { x = FMax(hmm->mat[k], Alphabet_size); TraceSet(tr, tpos, STM, k, i+1); seq[i] = Alphabet[x]; dsq[i+1] = x; if (hmm->mat[k][x] < mthresh) seq[i] = tolower((int) seq[i]); i++; tpos++; } else { TraceSet(tr, tpos, STD, k, 0); tpos++; } if (k < hmm->M && ip[k] >= 0.5) { x = (int) (1.f / (1.f - hmm->t[k][TII])); while (x--) { TraceSet(tr, tpos, STI, k, i+1); seq[i] = 'x'; dsq[i+1] = Alphabet_iupac - 1; i++; tpos++; } } } TraceSet(tr, tpos, STE, 0, 0); tpos++; TraceSet(tr, tpos, STC, 0, 0); tpos++; TraceSet(tr, tpos, STT, 0, 0); tpos++; dsq[i+1] = Alphabet_iupac; free(mp); free(ip); free(dp); if (ret_seq != NULL) *ret_seq = seq; else free(seq); if (ret_dsq != NULL) *ret_dsq = dsq; else free(dsq); if (ret_L != NULL) *ret_L = i; if (ret_tr != NULL) *ret_tr = tr; else P7FreeTrace(tr); }