/* Function: P7PriorifyEmissionVector() * * Purpose: Add prior pseudocounts to an observed * emission count vector and renormalize. * * Can return the posterior mixture probabilities * P(q | counts) if ret_mix[MAXDCHLET] is passed. * Else, pass NULL. * * Args: vec - the 4 or 20-long vector of counts to modify * pri - prior data structure * num - pri->mnum or pri->inum; # of mixtures * eq - pri->mq or pri->iq; prior mixture probabilities * e - pri->i or pri->m; Dirichlet components * ret_mix - filled with posterior mixture probabilities, or NULL * * Return: (void) * The counts in vec are changed and normalized to probabilities. */ void P7PriorifyEmissionVector(float *vec, struct p7prior_s *pri, int num, float eq[MAXDCHLET], float e[MAXDCHLET][MAXABET], float *ret_mix) { int x; /* counter over vec */ int q; /* counter over mixtures */ float mix[MAXDCHLET]; /* posterior distribution over mixtures */ float totc; /* total counts */ float tota; /* total alpha terms */ float xi; /* X_i term, Sjolander eq. 41 */ /* Calculate mix[], which is the posterior probability * P(q | n) of mixture component q given the count vector n * * (side effect note: note that an insert vector in a PAM prior * is passed with num = 1, bypassing pam prior code; this means * that inserts cannot be mixture Dirichlets...) * [SRE, 12/24/00: the above comment is cryptic! what the hell does that * mean, inserts can't be mixtures? doesn't seem to be true. it * may mean that in a PAM prior, you can't have a mixture for inserts, * but I don't even understand that. The insert vectors aren't passed * with num=1!!] */ mix[0] = 1.0; if (pri->strategy == PRI_DCHLET && num > 1) { for (q = 0; q < num; q++) { mix[q] = eq[q] > 0.0 ? 
log(eq[q]) : -999.; mix[q] += Logp_cvec(vec, Alphabet_size, e[q]); } LogNorm(mix, num); /* now mix[q] is P(component_q | n) */ } else if (pri->strategy == PRI_PAM && num > 1) { /* pam prior uses aa frequencies as `P(q|n)' */ for (q = 0; q < Alphabet_size; q++) mix[q] = vec[q]; FNorm(mix, Alphabet_size); } /* Convert the counts to probabilities, following Sjolander (1996) */ totc = FSum(vec, Alphabet_size); for (x = 0; x < Alphabet_size; x++) { xi = 0.0; for (q = 0; q < num; q++) { tota = FSum(e[q], Alphabet_size); xi += mix[q] * (vec[x] + e[q][x]) / (totc + tota); } vec[x] = xi; } FNorm(vec, Alphabet_size); if (ret_mix != NULL) for (q = 0; q < num; q++) ret_mix[q] = mix[q]; }
/* Function: P7PriorifyTransitionVector() * * Purpose: Add prior pseudocounts to transition vector, * which contains three different probability vectors * for m, d, and i. * * Args: t - state transitions, counts: 3 for M, 2 for I, 2 for D. * prior - Dirichlet prior information * tq - prior distribution over Dirichlet components. * (overrides prior->iq[]; used for alternative * methods of conditioning prior on structural data) * * Return: (void) * t is changed, and renormalized -- comes back as * probability vectors. */ void P7PriorifyTransitionVector(float *t, struct p7prior_s *prior, float tq[MAXDCHLET]) { int ts; int q; float mix[MAXDCHLET]; float totm, totd, toti; /* total counts in three transition vecs */ float xi; /* Sjolander's X_i term */ mix[0] = 1.0; /* default is simple one component */ if ((prior->strategy == PRI_DCHLET || prior->strategy == PRI_PAM) && prior->mnum > 1) { for (q = 0; q < prior->tnum; q++) { mix[q] = tq[q] > 0.0 ? log(tq[q]) : -999.; mix[q] += Logp_cvec(t, 3, prior->t[q]); /* 3 match */ mix[q] += Logp_cvec(t+3, 2, prior->t[q]+3); /* 2 insert */ mix[q] += Logp_cvec(t+5, 2, prior->t[q]+5); /* 2 delete */ } LogNorm(mix, prior->tnum); /* mix[q] is now P(q | counts) */ } /* precalc some denominators */ totm = FSum(t,3); toti = t[TIM] + t[TII]; totd = t[TDM] + t[TDD]; for (ts = 0; ts < 7; ts++) { xi = 0.0; for (q = 0; q < prior->tnum; q++) { switch (ts) { case TMM: case TMI: case TMD: xi += mix[q] * (t[ts] + prior->t[q][ts]) / (totm + FSum(prior->t[q], 3)); break; case TIM: case TII: xi += mix[q] * (t[ts] + prior->t[q][ts]) / (toti + prior->t[q][TIM] + prior->t[q][TII]); break; case TDM: case TDD: xi += mix[q] * (t[ts] + prior->t[q][ts]) / (totd + prior->t[q][TDM] + prior->t[q][TDD]); break; } } t[ts] = xi; } FNorm(t, 3); /* match */ FNorm(t+3, 2); /* insert */ FNorm(t+5, 2); /* delete */ }
/* Function: StrMarkov0()
 * Date:     SRE, Fri Oct 29 11:08:31 1999 [St. Louis]
 *
 * Purpose:  Returns a random string s1 with the same
 *           length and zero-th order Markov properties
 *           as s2.
 *
 *           s1 and s2 may be identical, to randomize s2
 *           in place.
 *
 * Args:     s1 - allocated space for random string
 *                (at least strlen(s2)+1 bytes)
 *           s2 - string to base s1's properties on.
 *
 * Returns:  1 on success; 0 if s2 doesn't look alphabetical
 *           (i.e. contains a character that does not map to 'A'..'Z').
 */
int
StrMarkov0(char *s1, char *s2)
{
  int   len;
  int   pos;
  int   sym;                    /* symbol index, 0..25 */
  float p[26];                  /* symbol probabilities */

  /* First, verify that the string is entirely alphabetic.
   * The <ctype.h> functions require an unsigned char value, so cast
   * before calling them. Also reject anything that does not land in
   * 0..25 after upper-casing: in a non-"C" locale isalpha() may accept
   * characters that would otherwise index outside p[].
   */
  len = (int) strlen(s2);
  for (pos = 0; pos < len; pos++)
    {
      if (! isalpha((unsigned char) s2[pos])) return 0;
      sym = toupper((unsigned char) s2[pos]) - 'A';
      if (sym < 0 || sym >= 26) return 0;
    }

  /* Collect zeroth order counts and convert to frequencies.
   */
  FSet(p, 26, 0.);
  for (pos = 0; pos < len; pos++)
    {
      sym = toupper((unsigned char) s2[pos]) - 'A';
      p[sym] += 1.0;
    }
  FNorm(p, 26);

  /* Generate a random string using those p's.
   * (s2 is no longer read past this point, so s1 == s2 is fine.)
   */
  for (pos = 0; pos < len; pos++)
    s1[pos] = FChoose(p, 26) + 'A';
  s1[pos] = '\0';

  return 1;
}
/* Function: Renormalize()
 *
 * Purpose:  Pass every probability vector of the model through
 *           FNorm(): the three-element transition vectors of the
 *           match, insert and delete states, and the emission
 *           vectors of the match and insert states.
 *
 *           Position 0 has no delete-state transitions and no
 *           match emissions normalized here; those are handled
 *           for positions 1..M only.
 *
 *           (Original note: P distributions that are all 0, or
 *           contain negative probabilities, are left untouched --
 *           that behavior, if any, comes from FNorm(), not from
 *           this function.)
 *
 * Args:     hmm - the model to renormalize
 *
 * Return:   (void)
 *           hmm is changed in place.
 */
void
Renormalize(struct hmm_struc *hmm)
{
  int pos;                      /* model position, 0..M */

  for (pos = 0; pos <= hmm->M; pos++)
    {
      /* state transition frequencies */
      FNorm(hmm->mat[pos].t, 3);
      FNorm(hmm->ins[pos].t, 3);

      /* these two vectors only exist from position 1 onward */
      if (pos > 0)
        {
          FNorm(hmm->del[pos].t, 3);
          FNorm(hmm->mat[pos].p, Alphabet_size);
        }

      /* insert emissions, at every position including 0 */
      FNorm(hmm->ins[pos].p, Alphabet_size);
    }
}
/* Function: PAMPrior() * * Purpose: Produces an ad hoc "Dirichlet mixture" prior for * match emissions, using a PAM matrix. * * Side effect notice: PAMPrior() replaces the match * emission section of an existing Dirichlet prior, * which is /expected/ to be a simple one-component * kind of prior. The insert emissions /must/ be a * one-component prior (because of details in how * PriorifyEmissionVector() is done). However, * the transitions /could/ be a mixture Dirichlet prior * without causing problems. In other words, the * -p and -P options of hmmb can coexist, but there * may be conflicts. PAMPrior() checks for these, * so there's no serious problem, except that the * error message from PAMPrior() might be confusing to * a user. */ void PAMPrior(char *pamfile, struct p7prior_s *pri, float wt) { FILE *fp; char *blastpamfile; /* BLAST looks in aa/ subdirectory of BLASTMAT */ int **pam; float scale; int xi, xj; int idx1, idx2; if (Alphabet_type != hmmAMINO) Die("PAM prior is only valid for protein sequences"); if (pri->strategy != PRI_DCHLET) Die("PAM prior may only be applied over an existing Dirichlet prior"); if (pri->inum != 1) Die("PAM prior requires that the insert emissions be a single Dirichlet"); if (MAXDCHLET < 20) Die("Whoa, code is misconfigured; MAXDCHLET must be >= 20 for PAM prior"); blastpamfile = FileConcat("aa", pamfile); if ((fp = fopen(pamfile, "r")) == NULL && (fp = EnvFileOpen(pamfile, "BLASTMAT", NULL)) == NULL && (fp = EnvFileOpen(blastpamfile, "BLASTMAT", NULL)) == NULL) Die("Failed to open PAM scoring matrix file %s", pamfile); if (! ParsePAMFile(fp, &pam, &scale)) Die("Failed to parse PAM scoring matrix file %s", pamfile); fclose(fp); free(blastpamfile); pri->strategy = PRI_PAM; pri->mnum = 20; /* Convert PAM entries back to conditional prob's P(xj | xi), * which we'll use as "pseudocounts" weighted by wt. 
*/ for (xi = 0; xi < Alphabet_size; xi++) for (xj = 0; xj < Alphabet_size; xj++) { idx1 = Alphabet[xi] - 'A'; idx2 = Alphabet[xj] - 'A'; pri->m[xi][xj] = aafq[xj] * exp((float) pam[idx1][idx2] * scale); } /* Normalize so that rows add up to wt. * i.e. Sum(xj) mat[xi][xj] = wt for every row xi */ for (xi = 0; xi < Alphabet_size; xi++) { pri->mq[xi] = 1. / Alphabet_size; FNorm(pri->m[xi], Alphabet_size); FScale(pri->m[xi], Alphabet_size, wt); } Free2DArray((void **)pam,27); }
/* Function: StrMarkov1()
 * Date:     SRE, Fri Oct 29 11:22:20 1999 [St. Louis]
 *
 * Purpose:  Returns a random string s1 with the same
 *           length and first order Markov properties
 *           as s2.
 *
 *           s1 and s2 may be identical, to randomize s2
 *           in place.
 *
 *           The first symbol of s1 is always the first symbol of
 *           s2 (upper-cased); the rest are drawn from the observed
 *           transition frequencies.
 *
 * Args:     s1 - allocated space for random string
 *                (at least strlen(s2)+1 bytes)
 *           s2 - string to base s1's properties on.
 *
 * Returns:  1 on success; 0 if s2 doesn't look alphabetical
 *           (i.e. contains a character that does not map to 'A'..'Z').
 *           An empty s2 succeeds and yields an empty s1.
 */
int
StrMarkov1(char *s1, char *s2)
{
  int   len;
  int   pos;
  int   x,y;
  int   i;                      /* initial symbol */
  float p[26][26];              /* symbol probabilities */

  /* First, verify that the string is entirely alphabetic.
   * The <ctype.h> functions require an unsigned char value, so cast
   * before calling them. Also reject anything that does not land in
   * 0..25 after upper-casing: in a non-"C" locale isalpha() may accept
   * characters that would otherwise index outside p[][].
   */
  len = (int) strlen(s2);
  for (pos = 0; pos < len; pos++)
    {
      if (! isalpha((unsigned char) s2[pos])) return 0;
      x = toupper((unsigned char) s2[pos]) - 'A';
      if (x < 0 || x >= 26) return 0;
    }

  /* An empty string has no initial symbol. Without this guard the code
   * below would derive one from the NUL terminator and then write
   * s1[1], one byte past a minimally sized output buffer.
   */
  if (len == 0)
    {
      s1[0] = '\0';
      return 1;
    }

  /* Collect first order counts and convert to frequencies.
   */
  for (x = 0; x < 26; x++)
    FSet(p[x], 26, 0.);

  i = x = toupper((unsigned char) s2[0]) - 'A';
  for (pos = 1; pos < len; pos++)
    {
      y = toupper((unsigned char) s2[pos]) - 'A';
      p[x][y] += 1.0;
      x = y;
    }
  for (x = 0; x < 26; x++)
    FNorm(p[x], 26);

  /* Generate a random string using those p's.
   * (s2 is no longer read past this point, so s1 == s2 is fine.)
   */
  x = i;
  s1[0] = x + 'A';
  for (pos = 1; pos < len; pos++)
    {
      y = FChoose(p[x], 26);
      s1[pos] = y + 'A';
      x = y;
    }
  s1[pos] = '\0';

  return 1;
}
/* Function: Plan7Renormalize() * * Purpose: Take an HMM in counts form, and renormalize * all of its probability vectors. Also enforces * Plan7 restrictions on nonexistent transitions. * * Args: hmm - the model to renormalize. * * Return: (void) * hmm is changed. */ void Plan7Renormalize(struct plan7_s *hmm) { int k; /* counter for model position */ int st; /* counter for special states */ float d; /* denominator */ /* match emissions */ for (k = 1; k <= hmm->M; k++) FNorm(hmm->mat[k], Alphabet_size); /* insert emissions */ for (k = 1; k < hmm->M; k++) FNorm(hmm->ins[k], Alphabet_size); /* begin transitions */ d = FSum(hmm->begin+1, hmm->M) + hmm->tbd1; FScale(hmm->begin+1, hmm->M, 1./d); hmm->tbd1 /= d; /* main model transitions */ for (k = 1; k < hmm->M; k++) { d = FSum(hmm->t[k], 3) + hmm->end[k]; FScale(hmm->t[k], 3, 1./d); hmm->end[k] /= d; FNorm(hmm->t[k]+3, 2); /* insert */ FNorm(hmm->t[k]+5, 2); /* delete */ } /* null model emissions */ FNorm(hmm->null, Alphabet_size); /* special transitions */ for (st = 0; st < 4; st++) FNorm(hmm->xt[st], 2); /* enforce nonexistent transitions */ /* (is this necessary?) */ hmm->t[0][TDM] = hmm->t[0][TDD] = 0.0; hmm->flags &= ~PLAN7_HASBITS; /* clear the log-odds ready flag */ hmm->flags |= PLAN7_HASPROB; /* set the probabilities OK flag */ }
//----------------------------------------------------------------------------- // TestMatrixFNorm //----------------------------------------------------------------------------- bool TestMatrixFNorm() { Matrix A("1,2,3;4,5,6;7,8,9"); return ApproxEqual( FNorm(A), 16.8819430161341, TOLERANCE); }