Example #1
0
// Accumulates state, emission and transition pseudo counts for the current
// trellis into `counts`, and returns the value produced by obsProb().
//
// Every quantity handed to `counts` is built by adding logAlpha/logBeta values
// to getTransProb()/getEmitProb() results and subtracting a normalizer, i.e.
// it is passed on in the same (log) domain as those terms.
//   - state counts are normalized per time slot by the sumLogProb() of
//     logAlpha+logBeta over all nodes of that slot;
//   - emission and transition counts are normalized by the obsProb() value.
double Hmm::getPseudoCounts(PseudoCounts& counts)
{
  // obsProb() runs the forward pass itself; backward() must follow so that
  // both logAlpha and logBeta are available on every node below.
  const double logObsProb = obsProb();
  backward();

  for (unsigned int slotIdx = 0; slotIdx<_timeSlots.size(); ++slotIdx) {
    TimeSlot* slot = _timeSlots[slotIdx];

    // Per-slot normalizer: combine logAlpha+logBeta of every node in the slot.
    vector<double> nodeScores;
    for (TimeSlot::iterator cur = slot->begin(); cur!=slot->end(); ++cur) {
      nodeScores.push_back((*cur)->logAlpha()+(*cur)->logBeta());
    }
    double slotNorm = sumLogProb(nodeScores);

    for (TimeSlot::iterator cur = slot->begin(); cur!=slot->end(); ++cur) {
      HmmNode* here = *cur;

      // Posterior of being in this node's state at this time slot.
      double occupancy = here->logAlpha()+here->logBeta()-slotNorm;
      counts.stateCount().add(here->state(), occupancy);

      // Incoming transitions: credit the observation carried by each
      // transition to the state of the node it arrives at.
      vector<Transition*>& incoming = here->ins();
      for (unsigned int a = 0; a<incoming.size(); ++a) {
        Transition* arc = incoming[a];
        HmmNode* source = arc->_from;
        double arcCount = source->logAlpha()+getTransProb(arc)
          +getEmitProb(arc)+here->logBeta()-logObsProb;
        counts.emitCount().add(here->state(), arc->_obs, arcCount);
      }

      // Outgoing transitions: credit the (this state -> next state) pair.
      vector<Transition*>& outgoing = here->outs();
      for (unsigned int a = 0; a<outgoing.size(); ++a) {
        Transition* arc = outgoing[a];
        HmmNode* target = arc->_to;
        double arcCount = here->logAlpha()+getTransProb(arc)
          +getEmitProb(arc)+target->logBeta()-logObsProb;
        counts.transCount().add(here->state(), target->state(), arcCount);
      }
    }
  }
  return logObsProb;
}
Example #2
0
// Computes the highest-scoring path through the trellis (Viterbi decoding).
//
// path:    receives the transitions of the best path, in time order. The
//          transitions are appended and then the WHOLE vector is reversed, so
//          callers are expected to pass an empty vector.
// returns: the logAlpha() score of the best node in the last time slot, or
//          log(0.0) (i.e. -infinity) when the trellis has no time slot, no
//          initial node, or an empty final slot; `path` is left untouched in
//          those cases.
//
// Side effects: overwrites logAlpha() and psi() of every node in slots 1..T
// and logAlpha() of the initial node. Back-tracking stops at the first node
// whose psi() is null.
double Hmm::viterbi(vector<Transition*>& path)
{
  // An empty trellis has no initial node to index; report "no path".
  if (_timeSlots.size()==0)
    return log(0.0);

  // set nodes at time 0 according to initial probabilities.
  TimeSlot* ts = _timeSlots[0];
  if (ts->begin()==ts->end())
    return log(0.0);
  HmmNode* init = (*ts)[0];
  init->logAlpha(0);

  // find the best path up to time t;
  for (unsigned int t = 1; t<_timeSlots.size(); t++) {
    ts = _timeSlots[t];
    for (TimeSlot::iterator it = ts->begin(); it!=ts->end(); ++it) {
      HmmNode* node = *it;
      vector<Transition*>& ins = node->ins();
      double maxProb = log(0.0); // -infinity: stays in place if there is no incoming transition
      Transition* bestTrans = 0;
      for (unsigned int i = 0; i<ins.size(); i++) {
        Transition* trans = ins[i];
        double logProb = trans->_from->logAlpha()+getTransProb(trans)+getEmitProb(trans);
        // the first candidate is always taken, even if it scores -infinity
        if (bestTrans==0 || maxProb<logProb) {
          bestTrans = trans;
          maxProb = logProb;
        }
      }
      node->logAlpha(maxProb); // store the highest probability in logAlpha
      node->psi(bestTrans); // store the best transition in psi
    }
  }

  // Find the best node at time T. It will be the last node in the best path
  ts = _timeSlots[_timeSlots.size()-1];
  HmmNode* best = 0;
  for (TimeSlot::iterator it = ts->begin(); it!=ts->end(); ++it) {
    HmmNode* node = *it;
    if (best==0 || best->logAlpha()<node->logAlpha())
      best = node;
  }

  // An empty final slot leaves `best` null; dereferencing it below would crash.
  if (best==0)
    return log(0.0);

  // retrieve the transitions of the best path, last transition first
  for (HmmNode* nd = best; nd;) {
    if (nd->psi()) {
      path.push_back(nd->psi());
      nd = nd->psi()->_from;
    }
    else
      nd = 0;
  }

  // reverse the path in place; indices use the vector's own size type so that
  // path.size() is never narrowed to int
  for (vector<Transition*>::size_type lo = 0, hi = path.size(); lo+1<hi; ++lo) {
    --hi;
    Transition* tmp = path[lo];
    path[lo] = path[hi];
    path[hi] = tmp;
  }

  // Done
  return best->logAlpha();
}
Example #3
0
void Hmm::forward()
{
  // compute forward probabilities at time 0
  TimeSlot* t0 = _timeSlots[0];
  HmmNode* init = (*t0)[0];
  init->logAlpha(0);

  // compute forward probabilities at time t using the alpha values for time t-1
  for (unsigned int t = 1; t<_timeSlots.size(); t++) {
    TimeSlot* ts = _timeSlots[t];
    for (TimeSlot::iterator it = ts->begin(); it!=ts->end(); it++) {
      vector<Transition*>& ins = (*it)->ins();
      vector<double> logProbs(ins.size());
      for (unsigned int i = 0; i<ins.size(); i++) {
	Transition* trans = ins[i];
	double logProb = trans->_from->logAlpha()+getTransProb(trans)+getEmitProb(trans);
	logProbs[i] = logProb;
      }
      (*it)->logAlpha(sumLogProb(logProbs));
    }
  }
}
Example #4
0
// Fills markov->posTransprob[ord][i][j] for every order 0..markov->maxorder,
// every sequence i and every position j of that sequence.
//
// markov:       supplies maxorder, numseqs, seqlen, minBeginDependInd and the
//               posTransprob table that is written.
// data:         supplies isBadPos (bad positions get probability 0.0) and the
//               sequences themselves (seqs).
// seqtransprob: one TransProb per sequence, passed through to getTransProb().
//
// For a good position the order actually used is obtained from
// getActualMarkovOrder(); the symbol at j and the actualOrder symbols before
// it are handed to getTransProb(), with -1 in every unused context slot.
// An order outside 0..5 is reported on stdout and terminates the program.
static
void precomputePosTransprob(Markov *markov, Dataset *data, TransProb **seqtransprob) {
	// getTransProb() takes exactly six symbol arguments (current symbol + 5 of context)
	enum { MAX_SUPPORTED_ORDER = 5 };

	for(int ord = 0; ord <= markov->maxorder; ord++) {
		for(int i = 0; i < markov->numseqs; i++) {
			TransProb *transprob = seqtransprob[i];
			for(int j = 0; j < markov->seqlen[i]; j++) {
				if(data->isBadPos[i][j]) {
					markov->posTransprob[ord][i][j] = 0.0;
					continue;
				}

				int actualOrder = getActualMarkovOrder(markov->minBeginDependInd, i, j, ord);

				// Validate BEFORE filling the context buffer: an out-of-range order
				// must not be allowed to index past the end of alphas[].
				if(actualOrder < 0 || actualOrder > MAX_SUPPORTED_ORDER) {
					printf("Error: invalid markov order at precomputePosTransprob(): %d\n", actualOrder);
					exit(1);
				}

				//determine the previous alphabets of the word; unused slots stay -1
				int alphas[MAX_SUPPORTED_ORDER + 1] = {-1, -1, -1, -1, -1, -1};
				for(int a = 0; a <= actualOrder; a++) {
					alphas[a] = data->seqs[i][j - a];
				}

				//compute transition probability
				markov->posTransprob[ord][i][j] = getTransProb(transprob, actualOrder,
					alphas[0], alphas[1], alphas[2], alphas[3], alphas[4], alphas[5]);
			}
		}
	}
}
Example #5
0
void Hmm::backward()
{
  int T = _timeSlots.size()-1;
  if (T<1) // no observation
    return;
  for (int t = T; t>=0; t--) {
    TimeSlot* ts = _timeSlots[t];
    for (TimeSlot::iterator it = ts->begin(); it!=ts->end(); it++) {
      HmmNode* node = *it;
      if (t==T)
	node->logBeta(0);
      else {
	vector<Transition*>& outs = node->outs();
	vector<double> logProbs(outs.size());
	for (unsigned int i = 0; i<outs.size(); i++) {
	  Transition* trans = outs[i];
	  double logProb = trans->_to->logBeta()+getTransProb(trans)+getEmitProb(trans);
	  logProbs[i] =logProb;
	}
	node->logBeta(sumLogProb(logProbs));
      }
    }
  }
}
Example #6
0
static
void precomputeBgscore(Markov *markov, Dataset *data, int span) {
	if(!markov->validSpan[span]) {
		printf("Error: proposed span is invalid.\n");
		exit(1);
	}

	//initialization
	double **score = markov->bgscore[span];

	for(int i = 0; i < data->numseqs; i++) {
		for(int j= 0; j < data->seqlen[i]; j++) {
			score[i][j] = NAN; //for out-of-bound (j >= seqlen[i] - span + 1)
		}
	}

	//precompute
	if(markov->bgmodel == BG_GIBBSMARKOV) {
		for(int i = 0; i < data->numseqs; i++) {
			for(int j= 0; j < data->seqlen[i] - span + 1; j++) {
				if(!data->isValidSite[span][i][j]) {
					score[i][j] = PINF;
				}
				else {
					score[i][j] = 1.0;
					for(int m = 0; m < span; m++) {
						score[i][j] *= getTransProb(markov,data, i, j+m, 0);
					}
					for(int k = 0; k < markov->maxorder; k++) { //exception from using "<= maxorder". See DEBUG0 check below.
						//If the motif is at the right edge of the forward-strand,
						//then the ratio will be 1.0 because j+span+k is at the reverse-strand.
						if(j+span+k < data->seqlen[i] && !data->isBadPos[i][j+span+k]) {
							score[i][j] *= getTransProb(markov,data, i, j+span+k, 0)
								/ getTransProb(markov, data, i, j+span+k, j+span);
						}
						else {
							break;
						}
					}

					if(DEBUG0) {
						int k = markov->maxorder;
						if(j+span+k < data->seqlen[i] && !data->isBadPos[i][j+span+k] ) {
							double ratio = getTransProb(markov,data, i, j+span+k, 0)
								/ getTransProb(markov, data, i, j+span+k, j+span);
							if(fabs(ratio - 1.0) > 0.000000001) {
								printf("Error: inconsistency in transition probability ratios\n");
								exit(1);
							}
						}
					}
				}
			}
		}
	}
	else if(markov->bgmodel == BG_BIOPRO) {
		for(int i = 0; i < data->numseqs; i++) {
			for(int j= 0; j < data->seqlen[i] - span + 1; j++) {
				if(!data->isValidSite[span][i][j]) {
					score[i][j] = PINF;
				}
				else {
					score[i][j] = 1.0;
					for(int m = 0; m < span; m++) {
						score[i][j] *= getTransProb(markov,data, i, j+m, j);
					}
				}
			}
		}
	}
	else if(markov->bgmodel == BG_MOTIFSAMPLER) {
		for(int i = 0; i < data->numseqs; i++) {
			for(int j= 0; j < data->seqlen[i] - span + 1; j++) {
				if(!data->isValidSite[span][i][j]) {
					score[i][j] = PINF;
				}
				else {
					score[i][j] = 1.0;
					for(int m = 0; m < span; m++) {
						score[i][j] *= getTransProb(markov,data, i, j+m, 0);
					}
				}
			}
		}
	}
	else if(markov->bgmodel == BG_GMEAN) {
		//similar to MOTIF_SAMPLER
		for(int i = 0; i < data->numseqs; i++) {
			for(int j= 0; j < data->seqlen[i] - span + 1; j++) {
				if(!data->isValidSite[span][i][j]) {
					score[i][j] = PINF;
				}
				else {
					score[i][j] = 1.0;
					for(int m = 0; m < span; m++) {
						double score1 = getTransProb(markov,data, i, j+m, 0);
						double score2 = getTransProb(markov, data, i, getConcatPosOfOppStrand(data, i, j+m, 1), 0);

						score[i][j] *= sqrt(score1 * score2);
					}
				}
			}
		}
	}
	else if(markov->bgmodel == BG_AMEAN) {
		//similar to MOTIF_SAMPLER
		for(int i = 0; i < data->numseqs; i++) {
			for(int j= 0; j < data->seqlen[i] - span + 1; j++) {
				if(!data->isValidSite[span][i][j]) {
					score[i][j] = PINF;
				}
				else {
					score[i][j] = 1.0;
					for(int m = 0; m < span; m++) {
						double score1 = getTransProb(markov,data, i, j+m, 0);
						double score2 = getTransProb(markov, data, i, getConcatPosOfOppStrand(data, i, j+m, 1), 0);

						score[i][j] *= (score1 + score2) / 2.0;
					}
				}
			}
		}
	}

	else {
		printf("Error: Invalid background type\n"); exit(1);
	}

}