// Runs obsProb() (which performs the forward pass) and backward(), then walks
// every node of every time slot and adds log-domain count contributions to
// `counts`:
//   - stateCount : logAlpha + logBeta of the node, minus the slot normaliser
//   - emitCount  : one term per incoming transition of the node
//   - transCount : one term per outgoing transition of the node
// Returns the value obtained from obsProb().
double Hmm::getPseudoCounts(PseudoCounts& counts)
{
  const double logObsProb = obsProb(); // includes a forward() call
  backward();

  for (unsigned int slotIdx = 0; slotIdx < _timeSlots.size(); ++slotIdx) {
    TimeSlot* slot = _timeSlots[slotIdx];

    // Slot normaliser: sumLogProb over (logAlpha + logBeta) of all nodes in
    // this slot. Presumably sumLogProb is a log-sum-exp -- confirm at its
    // definition.
    vector<double> jointLogs;
    for (TimeSlot::iterator cur = slot->begin(); cur != slot->end(); ++cur) {
      jointLogs.push_back((*cur)->logAlpha() + (*cur)->logBeta());
    }
    const double slotNorm = sumLogProb(jointLogs);

    for (TimeSlot::iterator cur = slot->begin(); cur != slot->end(); ++cur) {
      HmmNode* node = *cur;

      // State occupancy term for this node.
      const double occupancy = node->logAlpha() + node->logBeta() - slotNorm;
      counts.stateCount().add(node->state(), occupancy);

      // Emission terms: keyed by this node's state and the observation
      // carried on each incoming transition.
      vector<Transition*>& incoming = node->ins();
      for (unsigned int e = 0; e < incoming.size(); ++e) {
        Transition* edge = incoming[e];
        HmmNode* src = edge->_from;
        const double weight = src->logAlpha() + getTransProb(edge)
                            + getEmitProb(edge) + node->logBeta() - logObsProb;
        counts.emitCount().add(node->state(), edge->_obs, weight);
      }

      // Transition terms: keyed by this node's state and the successor state.
      vector<Transition*>& outgoing = node->outs();
      for (unsigned int e = 0; e < outgoing.size(); ++e) {
        Transition* edge = outgoing[e];
        HmmNode* dst = edge->_to;
        const double weight = node->logAlpha() + getTransProb(edge)
                            + getEmitProb(edge) + dst->logBeta() - logObsProb;
        counts.transCount().add(node->state(), dst->state(), weight);
      }
    }
  }

  return logObsProb;
}
// Finds the highest-scoring path through the trellis.
//
// For every node in time slots 1..T the best incoming score
// (predecessor logAlpha + getTransProb + getEmitProb) is stored in the node's
// logAlpha and the transition achieving it in the node's psi. The transitions
// of the best path are then appended to `path` in chronological order.
//
// path:    output; the best path's transitions are appended. Entries already
//          present in `path` are left untouched and in place.
// returns: logAlpha of the best node in the final time slot, or log(0.0)
//          when there is no time slot, the first slot has no node, or the
//          final slot has no node (in which case `path` is not modified).
double Hmm::viterbi(vector<Transition*>& path)
{
  const double logZero = log(0.0);

  // Guard: indexing _timeSlots[0] / (*ts)[0] below is unchecked.
  if (_timeSlots.size() == 0)
    return logZero;
  TimeSlot* ts = _timeSlots[0];
  if (ts->begin() == ts->end())
    return logZero;

  // The first node of slot 0 is the start node; its score is log(1) = 0.
  HmmNode* init = (*ts)[0];
  init->logAlpha(0);

  // Find the best path up to each node of slot t.
  for (unsigned int t = 1; t < _timeSlots.size(); t++) {
    ts = _timeSlots[t];
    for (TimeSlot::iterator it = ts->begin(); it != ts->end(); it++) {
      HmmNode* node = *it;
      vector<Transition*>& ins = node->ins();
      double maxProb = logZero;
      Transition* bestTrans = 0;
      for (unsigned int i = 0; i < ins.size(); i++) {
        Transition* trans = ins[i];
        double logProb = trans->_from->logAlpha() + getTransProb(trans) + getEmitProb(trans);
        // The first candidate is always accepted so that bestTrans is set
        // even when every candidate scores log(0.0).
        if (bestTrans == 0 || maxProb < logProb) {
          bestTrans = trans;
          maxProb = logProb;
        }
      }
      node->logAlpha(maxProb); // highest score reaching this node
      node->psi(bestTrans);    // transition that achieved it (0 if no ins)
    }
  }

  // Pick the best node in the final slot; it ends the best path.
  ts = _timeSlots[_timeSlots.size() - 1];
  HmmNode* best = 0;
  for (TimeSlot::iterator it = ts->begin(); it != ts->end(); it++) {
    HmmNode* node = *it;
    if (best == 0 || best->logAlpha() < node->logAlpha())
      best = node;
  }
  // Guard: an empty final slot leaves best null; do not dereference it.
  if (best == 0)
    return logZero;

  // Walk the psi back-pointers from the best node; this yields the path's
  // transitions last-to-first.
  const unsigned int firstNew = path.size();
  for (HmmNode* nd = best; nd && nd->psi(); nd = nd->psi()->_from) {
    path.push_back(nd->psi());
  }

  // Put only the newly appended segment into chronological order, so that
  // anything the caller already had in `path` is not scrambled.
  for (unsigned int lo = firstNew, hi = path.size(); lo + 1 < hi; lo++) {
    hi--;
    Transition* tmp = path[lo];
    path[lo] = path[hi];
    path[hi] = tmp;
  }

  return best->logAlpha();
}
void Hmm::forward() { // compute forward probabilities at time 0 TimeSlot* t0 = _timeSlots[0]; HmmNode* init = (*t0)[0]; init->logAlpha(0); // compute forward probabilities at time t using the alpha values for time t-1 for (unsigned int t = 1; t<_timeSlots.size(); t++) { TimeSlot* ts = _timeSlots[t]; for (TimeSlot::iterator it = ts->begin(); it!=ts->end(); it++) { vector<Transition*>& ins = (*it)->ins(); vector<double> logProbs(ins.size()); for (unsigned int i = 0; i<ins.size(); i++) { Transition* trans = ins[i]; double logProb = trans->_from->logAlpha()+getTransProb(trans)+getEmitProb(trans); logProbs[i] = logProb; } (*it)->logAlpha(sumLogProb(logProbs)); } } }
static void precomputePosTransprob(Markov *markov, Dataset *data, TransProb **seqtransprob) { for(int ord = 0; ord <= markov->maxorder; ord++) { for(int i = 0; i < markov->numseqs; i++) { TransProb *transprob = seqtransprob[i]; for(int j = 0; j < markov->seqlen[i]; j++) { if(data->isBadPos[i][j]) { markov->posTransprob[ord][i][j] = 0.0; } else { int actualOrder = getActualMarkovOrder(markov->minBeginDependInd, i, j, ord); //determine the previous alphabets of the word int alphas[MARKOV_ORDER_BOUND + 1]; for(int a = 0; a <= actualOrder; a++) { alphas[a] = data->seqs[i][j - a]; } //compute transition probability if(actualOrder == 0) { markov->posTransprob[ord][i][j] = getTransProb(transprob, actualOrder, alphas[0], -1, -1, -1, -1, -1); } else if(actualOrder == 1) { markov->posTransprob[ord][i][j] = getTransProb(transprob, actualOrder, alphas[0], alphas[1], -1, -1, -1, -1); } else if(actualOrder == 2) { markov->posTransprob[ord][i][j] = getTransProb(transprob, actualOrder, alphas[0], alphas[1], alphas[2], -1, -1, -1); } else if(actualOrder == 3) { markov->posTransprob[ord][i][j] = getTransProb(transprob, actualOrder, alphas[0], alphas[1], alphas[2], alphas[3], -1, -1); } else if(actualOrder == 4) { markov->posTransprob[ord][i][j] = getTransProb(transprob, actualOrder, alphas[0], alphas[1], alphas[2], alphas[3], alphas[4], -1); } else if(actualOrder == 5) { markov->posTransprob[ord][i][j] = getTransProb(transprob, actualOrder, alphas[0], alphas[1], alphas[2], alphas[3], alphas[4], alphas[5]); } else { printf("Error: invalid markov order at precomputePosTransprob(): %d\n", actualOrder); exit(1); } } } } } }
void Hmm::backward() { int T = _timeSlots.size()-1; if (T<1) // no observation return; for (int t = T; t>=0; t--) { TimeSlot* ts = _timeSlots[t]; for (TimeSlot::iterator it = ts->begin(); it!=ts->end(); it++) { HmmNode* node = *it; if (t==T) node->logBeta(0); else { vector<Transition*>& outs = node->outs(); vector<double> logProbs(outs.size()); for (unsigned int i = 0; i<outs.size(); i++) { Transition* trans = outs[i]; double logProb = trans->_to->logBeta()+getTransProb(trans)+getEmitProb(trans); logProbs[i] =logProb; } node->logBeta(sumLogProb(logProbs)); } } } }
static void precomputeBgscore(Markov *markov, Dataset *data, int span) { if(!markov->validSpan[span]) { printf("Error: proposed span is invalid.\n"); exit(1); } //initialization double **score = markov->bgscore[span]; for(int i = 0; i < data->numseqs; i++) { for(int j= 0; j < data->seqlen[i]; j++) { score[i][j] = NAN; //for out-of-bound (j >= seqlen[i] - span + 1) } } //precompute if(markov->bgmodel == BG_GIBBSMARKOV) { for(int i = 0; i < data->numseqs; i++) { for(int j= 0; j < data->seqlen[i] - span + 1; j++) { if(!data->isValidSite[span][i][j]) { score[i][j] = PINF; } else { score[i][j] = 1.0; for(int m = 0; m < span; m++) { score[i][j] *= getTransProb(markov,data, i, j+m, 0); } for(int k = 0; k < markov->maxorder; k++) { //exception from using "<= maxorder". See DEBUG0 check below. //If the motif is at the right edge of the forward-strand, //then the ratio will be 1.0 because j+span+k is at the reverse-strand. if(j+span+k < data->seqlen[i] && !data->isBadPos[i][j+span+k]) { score[i][j] *= getTransProb(markov,data, i, j+span+k, 0) / getTransProb(markov, data, i, j+span+k, j+span); } else { break; } } if(DEBUG0) { int k = markov->maxorder; if(j+span+k < data->seqlen[i] && !data->isBadPos[i][j+span+k] ) { double ratio = getTransProb(markov,data, i, j+span+k, 0) / getTransProb(markov, data, i, j+span+k, j+span); if(fabs(ratio - 1.0) > 0.000000001) { printf("Error: inconsistency in transition probability ratios\n"); exit(1); } } } } } } } else if(markov->bgmodel == BG_BIOPRO) { for(int i = 0; i < data->numseqs; i++) { for(int j= 0; j < data->seqlen[i] - span + 1; j++) { if(!data->isValidSite[span][i][j]) { score[i][j] = PINF; } else { score[i][j] = 1.0; for(int m = 0; m < span; m++) { score[i][j] *= getTransProb(markov,data, i, j+m, j); } } } } } else if(markov->bgmodel == BG_MOTIFSAMPLER) { for(int i = 0; i < data->numseqs; i++) { for(int j= 0; j < data->seqlen[i] - span + 1; j++) { if(!data->isValidSite[span][i][j]) { score[i][j] = PINF; } else { score[i][j] = 1.0; 
for(int m = 0; m < span; m++) { score[i][j] *= getTransProb(markov,data, i, j+m, 0); } } } } } else if(markov->bgmodel == BG_GMEAN) { //similar to MOTIF_SAMPLER for(int i = 0; i < data->numseqs; i++) { for(int j= 0; j < data->seqlen[i] - span + 1; j++) { if(!data->isValidSite[span][i][j]) { score[i][j] = PINF; } else { score[i][j] = 1.0; for(int m = 0; m < span; m++) { double score1 = getTransProb(markov,data, i, j+m, 0); double score2 = getTransProb(markov, data, i, getConcatPosOfOppStrand(data, i, j+m, 1), 0); score[i][j] *= sqrt(score1 * score2); } } } } } else if(markov->bgmodel == BG_AMEAN) { //similar to MOTIF_SAMPLER for(int i = 0; i < data->numseqs; i++) { for(int j= 0; j < data->seqlen[i] - span + 1; j++) { if(!data->isValidSite[span][i][j]) { score[i][j] = PINF; } else { score[i][j] = 1.0; for(int m = 0; m < span; m++) { double score1 = getTransProb(markov,data, i, j+m, 0); double score2 = getTransProb(markov, data, i, getConcatPosOfOppStrand(data, i, j+m, 1), 0); score[i][j] *= (score1 + score2) / 2.0; } } } } } else { printf("Error: Invalid background type\n"); exit(1); } }