LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & /*outState */) const
{
  // Scores the last word of `contextFactor` given the preceding words with a
  // factored (parallel back-off) SRILM model. Each row of the width matrix
  // holds [word-tag id | one VocabIndex per ordered factor type].
  //
  // NOTE(review): the `static` buffer makes this method non-reentrant —
  // confirm the decoder never calls it from multiple threads.
  static WidMatrix widMatrix;

  // Guard the empty context: the original fell through and passed
  // contextFactor.size() - 1 (a huge size_t) to wordProb().
  if (contextFactor.empty()) {
    LMResult ret;
    ret.score = 0.0;
    ret.unknown = false;
    return ret;
  }

  // Zero one row per context word: the leading tag slot plus one id per
  // factor type. (Loop index is size_t now; the original mixed int/size_t.)
  for (size_t i = 0; i < contextFactor.size(); i++)
    ::memset(widMatrix[i], 0, (m_factorTypesOrdered.size() + 1) * sizeof(VocabIndex));

  for (size_t i = 0; i < contextFactor.size(); i++) {
    const Word &word = *contextFactor[i];

    // Slot 0 is reserved for the word-tag id; factor ids start at column 1.
    for (size_t j = 0; j < m_factorTypesOrdered.size(); j++) {
      const Factor *factor = word[ m_factorTypesOrdered[j] ];

      if (factor == NULL)
        widMatrix[i][j + 1] = 0;
      else
        widMatrix[i][j + 1] = GetLmID(factor, j);
    }

    // Tag the row as sentence-begin, sentence-end, or an ordinary word.
    if (widMatrix[i][1] == GetLmID(m_sentenceStartArray[0], 0) ) {
      widMatrix[i][0] = m_wtbid;
    } else if (widMatrix[i][1] == GetLmID(m_sentenceEndArray[0], 0 )) {
      widMatrix[i][0] = m_wteid;
    } else {
      widMatrix[i][0] = m_wtid;
    }
  }

  LMResult ret;
  ret.score = m_srilmModel->wordProb( widMatrix, contextFactor.size() - 1, contextFactor.size() );
  ret.score = FloorScore(TransformLMScore(ret.score));
  // OOV iff the predicted (last) word's tag slot carries the unknown id.
  ret.unknown = (widMatrix[contextFactor.size() - 1][0] == m_unknownId);
  return ret;
}
Пример #2
0
LMResult LanguageModelSRI::GetValueBF(const vector< std::string> contextFactor, bool ISfinale) const
{
  // Scores the final string of `contextFactor` against the SRILM model.
  // Words arrive oldest-first; SRILM expects the history most-recent-first,
  // hence the count-2-i reversal below.
  LMResult ret;
  size_t count = contextFactor.size();
  if (count == 0) {  // size_t: the original "<= 0" can only mean empty
    ret.score = 0.0;
    ret.unknown = false;
    return ret;
  }

  // Slot 0 is reserved for the predicted word; slots 1..count-1 hold the
  // reversed history, terminated by Vocab_None as SRILM requires.
  // std::vector replaces the original variable-length array, which is a
  // compiler extension rather than standard C++.
  std::vector<VocabIndex> ngram(count + 1);
  for (size_t i = 0 ; i < count - 1 ; i++) {
    ngram[i+1] = GetLmID(contextFactor[count-2-i]);
  }
  ngram[count] = Vocab_None;

  VocabIndex lmId = GetLmID(contextFactor[count-1]);
  ret = GetValue(lmId, &ngram[1]);

  // NOTE(review): the original `if (ISfinale)` branch only wrote into the
  // local buffer and declared an unused variable — it had no observable
  // effect. It looks like a truncated copy of GetValue()'s contextID() call;
  // confirm whether a final-state computation was intended here.
  (void)ISfinale;

  return ret;
}
Пример #3
0
LMResult LanguageModelSRI::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  // Scores the last word of `contextFactor` given the preceding words,
  // optionally returning the SRILM context state via `finalState`.
  LMResult ret;
  FactorType	factorType = GetFactorType();
  size_t count = contextFactor.size();
  if (count == 0) {  // size_t: the original "<= 0" can only mean empty
    if(finalState)
      *finalState = NULL;
    ret.score = 0.0;
    ret.unknown = false;
    return ret;
  }

  // Set up the context: slot 0 is reserved for the predicted word (used by
  // contextID below); slots 1..count-1 hold the reversed history
  // (most-recent-first), terminated by Vocab_None as SRILM requires.
  // std::vector replaces the original variable-length array, which is a
  // compiler extension rather than standard C++.
  std::vector<VocabIndex> ngram(count + 1);
  for (size_t i = 0 ; i < count - 1 ; i++) {
    ngram[i+1] =  GetLmID((*contextFactor[count-2-i])[factorType]);
  }
  ngram[count] = Vocab_None;

  CHECK((*contextFactor[count-1])[factorType] != NULL);
  // call sri lm fn
  VocabIndex lmId = GetLmID((*contextFactor[count-1])[factorType]);
  ret = GetValue(lmId, &ngram[1]);

  if (finalState) {
    // Prepend the predicted word and ask SRILM for the longest usable context.
    ngram[0] = lmId;
    unsigned int dummy;
    *finalState = m_srilmModel->contextID(&ngram[0], dummy);
  }
  return ret;
}
Пример #4
0
float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const
{
  // Scores the last word of `contextFactor` with the IRSTLM model,
  // optionally returning the LM state (`finalState`) and a back-off
  // length (`len`, always 0 on this code path).
  unsigned int unusedLen;
  if (!len) {
    len = &unusedLen;
  }
  FactorType factorType = GetFactorType();

  size_t count = contextFactor.size();

  // Reset the shared n-gram buffer and pad short contexts with at most one
  // sentence-end and one sentence-start marker.
  m_lmtb_ng->size = 0;
  if (count < (size_t)(m_lmtb_size - 1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd);
  if (count < (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart);

  for (size_t i = 0; i < count; i++) {
#ifdef DEBUG
    cout << "i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n";
#endif
    int lmId = GetLmID((*contextFactor[i])[factorType]->GetString());
    m_lmtb_ng->pushc(lmId);
  }

  if (finalState) {
    *finalState = (State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng);
    // back off stats not currently available
    *len = 0;
  }

  float prob = m_lmtb->clprob(*m_lmtb_ng);

  return TransformLMScore(prob);
}
float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  // Scores the last word of `contextFactor` given the preceding words with
  // the IRSTLM model, optionally returning the LM state via `finalState`.
  FactorType factorType = GetFactorType();

  size_t count = contextFactor.size();
  // (The original guarded "count < 0", which is always false for a size_t;
  //  the unreachable check and its exit(100) have been removed.)

  // set up context
  // NOTE(review): assumes count + 2 <= MAX_NGRAM_SIZE; confirm callers never
  // pass a longer context.
  int codes[MAX_NGRAM_SIZE];

  size_t idx = 0;
  // Fill the farthest positions with at most ONE sentenceEnd symbol and at
  // most ONE sentenceStart symbol, if "empty" positions are available, so the
  // vector looks like "</s> <s> context_word context_word" for a two-word
  // context and a LM of order 5.
  if (count < (size_t)(m_lmtb_size - 1)) codes[idx++] = m_lmtb_sentenceEnd;
  if (count < (size_t)m_lmtb_size) codes[idx++] = m_lmtb_sentenceStart;

  for (size_t i = 0 ; i < count ; i++)
    codes[idx++] = GetLmID((*contextFactor[i])[factorType]);

  float prob;
  char* msp = NULL;
  unsigned int ilen;
  prob = m_lmtb->clprob(codes, idx, NULL, NULL, &msp, &ilen);

  if (finalState) *finalState = (State *)msp;

  return TransformLMScore(prob);
}
Пример #6
0
LMResult LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  // Scores the last word of `contextFactor` with the IRSTLM model and flags
  // whether it was out-of-vocabulary; optionally returns the LM state.
  size_t count = contextFactor.size();
  // (The original guarded "count < 0", which is always false for a size_t;
  //  the unreachable check and its exit(100) have been removed.)

  // set up context
  int codes[MAX_NGRAM_SIZE];

  size_t idx=0;
  // Fill the farthest positions with at most ONE sentenceEnd symbol and at
  // most ONE sentenceStart symbol, if "empty" positions are available, so the
  // vector looks like "</s> <s> context_word context_word" for a two-word
  // context and a LM of order 5.
  if (count < (size_t) (m_lmtb_size-1)) codes[idx++] = m_lmtb_sentenceEnd;
  if (count < (size_t) m_lmtb_size) codes[idx++] = m_lmtb_sentenceStart;

  for (size_t i = 0 ; i < count ; i++) {
    codes[idx] =  GetLmID(*contextFactor[i]);
    ++idx;
  }

  LMResult result;
  // The last code pushed is the predicted word; compare it to the OOV id.
  result.unknown = (codes[idx - 1] == m_unknownId);

  char* msp = NULL;
  result.score = m_lmtb->clprob(codes,idx,NULL,NULL,&msp);

  if (finalState) *finalState=(State *) msp;

  result.score = TransformLMScore(result.score);

  return result;
}
Пример #7
0
void LanguageModelIRST::CreateFactors(FactorCollection &factorCollection)
{
  // Builds m_lmIdLookup, a dense table mapping Moses factor ids to IRSTLM
  // word codes. (Logic mirrors the SRI LM class; could be a shared template.)
  std::map<size_t, int> lmIdMap;
  size_t maxFactorId = 0; // highest factor id seen; sizes the lookup vector

  // Walk the IRSTLM dictionary (micro-tag level), registering each word as an
  // output factor and remembering its LM code.
  dictionary_iter iter(m_lmtb->getDict());
  dict_entry *entry;
  while ((entry = iter.next()) != NULL) {
    size_t factorId = factorCollection.AddFactor(Output, m_factorType, entry->word)->GetId();
    lmIdMap[factorId] = entry->code;
    if (factorId > maxFactorId)
      maxFactorId = factorId;
  }

  // Explicit entries for the sentence-boundary markers.
  size_t factorId;

  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  factorId = m_sentenceStart->GetId();
  m_lmtb_sentenceStart = GetLmID(BOS_);
  lmIdMap[factorId] = m_lmtb_sentenceStart;
  if (factorId > maxFactorId)
    maxFactorId = factorId;
  m_sentenceStartArray[m_factorType] = m_sentenceStart;

  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
  factorId = m_sentenceEnd->GetId();
  m_lmtb_sentenceEnd = GetLmID(EOS_);
  lmIdMap[factorId] = m_lmtb_sentenceEnd;
  if (factorId > maxFactorId)
    maxFactorId = factorId;
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;

  // Materialise the sparse map as a dense vector; ids with no LM entry fall
  // back to the unknown-word code.
  m_lmIdLookup.resize(maxFactorId + 1);
  fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);
  for (map<size_t, int>::iterator it = lmIdMap.begin(); it != lmIdMap.end(); ++it) {
    m_lmIdLookup[it->first] = it->second;
  }
}
Пример #8
0
void LanguageModelSRI::CreateFactors()
{
  // add factors which have srilm id
  FactorCollection &factorCollection = FactorCollection::Instance();

  std::map<size_t, VocabIndex> lmIdMap;
  size_t maxFactorId = 0; // to create lookup vector later on

  VocabString str;
  VocabIter iter(*m_srilmVocab);
  while ( (str = iter.next()) != NULL) {
    VocabIndex lmId = GetLmID(str);
    size_t factorId = factorCollection.AddFactor(Output, m_factorType, str)->GetId();
    lmIdMap[factorId] = lmId;
    maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  }

  size_t factorId;

  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  factorId = m_sentenceStart->GetId();
  lmIdMap[factorId] = GetLmID(BOS_);
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceStartWord[m_factorType] = m_sentenceStart;

  m_sentenceEnd		= factorCollection.AddFactor(Output, m_factorType, EOS_);
  factorId = m_sentenceEnd->GetId();
  lmIdMap[factorId] = GetLmID(EOS_);
  maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
  m_sentenceEndWord[m_factorType] = m_sentenceEnd;

  // add to lookup vector in object
  m_lmIdLookup.resize(maxFactorId+1);

  fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), m_unknownId);

  map<size_t, VocabIndex>::iterator iterMap;
  for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) {
    m_lmIdLookup[iterMap->first] = iterMap->second;
  }
}
Пример #9
0
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, State* finalState) const
{
	float score;
	const NGramNode *nGram[2];

	nGram[1]		= GetLmID(factor1);
	if (nGram[1] == NULL)
	{
		if (finalState != NULL)
			*finalState = NULL;
		score = -numeric_limits<float>::infinity();
	}
	else
	{
		nGram[0] = nGram[1]->GetNGram(factor0);
		if (nGram[0] == NULL)
		{ // something unigram
			if (finalState != NULL)
				*finalState = static_cast<const void*>(nGram[1]);
			
			nGram[0]	= GetLmID(factor0);
			if (nGram[0] == NULL)
			{ // stops at unigram
				score = nGram[1]->GetScore();
			}
			else
			{	// unigram unigram
				score = nGram[1]->GetScore() + nGram[0]->GetLogBackOff();
			}
		}
		else
		{ // bigram
			if (finalState != NULL)
				*finalState = static_cast<const void*>(nGram[0]);
			score			= nGram[0]->GetScore();
		}
	}

	return FloorScore(score);

}
Пример #10
0
void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  // Scores `phrase` with the IRSTLM model.
  //   fullScore  - total (transformed) log score of the phrase
  //   ngramScore - score of positions that have a full (order-1)-word history
  //   oovCount   - number of out-of-vocabulary words in the phrase
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if ( !phrase.GetSize() ) return;

  // Positions [0, _min) are scored with <s>-anchored, growing histories.
  int _min = min(m_lmtb_size - 1, (int) phrase.GetSize());

  // Sliding n-gram window. std::vector replaces the original runtime-sized
  // array "int codes[m_lmtb_size]", which is a compiler extension rather
  // than standard C++.
  std::vector<int> codes(m_lmtb_size);
  int idx = 0;
  codes[idx] = m_lmtb_sentenceStart;
  ++idx;
  int position = 0;

  char* msp = NULL;
  float before_boundary = 0.0;
  for (; position < _min; ++position) {
    codes[idx] = GetLmID(phrase.GetWord(position));
    if (codes[idx] == m_unknownId) ++oovCount;
    before_boundary += m_lmtb->clprob(&codes[0], idx + 1, NULL, NULL, &msp);
    ++idx;
  }

  // Remaining positions have a full history: shift the window left by one
  // and score the new word with a complete n-gram of the LM order.
  ngramScore = 0.0;
  int end_loop = (int) phrase.GetSize();

  for (; position < end_loop; ++position) {
    for (idx = 1; idx < m_lmtb_size; ++idx) {
      codes[idx - 1] = codes[idx];
    }
    codes[idx - 1] = GetLmID(phrase.GetWord(position));
    if (codes[idx - 1] == m_unknownId) ++oovCount;
    ngramScore += m_lmtb->clprob(&codes[0], idx, NULL, NULL, &msp);
  }
  before_boundary = TransformLMScore(before_boundary);
  ngramScore = TransformLMScore(ngramScore);
  fullScore = ngramScore + before_boundary;
}
Пример #11
0
float LanguageModelInternal::GetValue(const Factor *factor0, State* finalState) const
{
	float prob;
	const NGramNode *nGram		= GetLmID(factor0);
	if (nGram == NULL)
	{
		if (finalState != NULL)
			*finalState = NULL;
		prob = -numeric_limits<float>::infinity();
	}
	else
	{
		if (finalState != NULL)
			*finalState = static_cast<const void*>(nGram);
		prob = nGram->GetScore();
	}
	return FloorScore(prob);
}
Пример #12
0
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
                                       State* finalState) const
{
    // Scores the last word of `contextFactor` given the preceding words with
    // the randomized LM, optionally returning the LM state via `finalState`.
    FactorType factorType = GetFactorType();
    // set up context
    // NOTE(review): no bounds check against MAX_NGRAM_SIZE — assumes callers
    // never pass a context longer than the LM order; confirm.
    randlm::WordID ngram[MAX_NGRAM_SIZE];
    int count = contextFactor.size();
    for (int i = 0 ; i < count ; i++) {
        ngram[i] = GetLmID((*contextFactor[i])[factorType]);
        //std::cerr << m_lm->getWord(ngram[i]) << " ";
    }
    int found = 0;
    LMResult ret;
    // getProb's result is transformed to Moses' log domain and floored.
    // `found` (length of the matched n-gram) is filled in but unused here.
    ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
    // OOV iff the predicted (last) word mapped to the OOV id.
    ret.unknown = count && (ngram[count - 1] == m_oov_id);
    //if (finalState)
    //  std::cerr << " = " << logprob << "(" << *finalState << ", " <<")"<< std::endl;
    //else
    //  std::cerr << " = " << logprob << std::endl;
    return ret;
}
void LanguageModelParallelBackoff::CreateFactors()
{
  // Populates lmIdMap and the sentence-marker / word-tag ids from the
  // factored SRILM vocabulary. Vocabulary entries appear to be of the form
  // "<letter>-<string>" (e.g. "a-dog", "W-<s>"): the leading letter selects
  // the factor type ('a'..'k' index into m_factorTypesOrdered) and the text
  // after position 2 is the surface form — TODO confirm against the model.

  // add factors which have srilm id
  FactorCollection &factorCollection = FactorCollection::Instance();

  // NOTE(review): raw `new` with no visible matching delete — leaks if
  // CreateFactors() runs more than once; confirm ownership elsewhere.
  lmIdMap = new std::map<size_t, VocabIndex>();


  VocabString str;
  VocabIter iter(*m_srilmVocab);

  iter.init();

  size_t pomFactorTypeNum = 0;


  while ( (str = iter.next()) != NULL) {

    // Skip entries whose first letter is neither a factor tag nor 'W'.
    // NOTE(review): 'W' entries still reach the code below, where
    // str[0] - 'a' is negative ('W' < 'a') and wraps as a size_t index
    // into m_factorTypesOrdered — looks out-of-bounds; verify.
    if ((str[0] < 'a' || str[0] > 'k') && str[0] != 'W') {
      continue;
    }
    VocabIndex lmId = GetLmID(str);
    pomFactorTypeNum = str[0] - 'a';

    // Map key packs factor id and factor-type index as factorId*10 + type —
    // assumes at most 10 factor types.
    size_t factorId = factorCollection.AddFactor(Output, m_factorTypesOrdered[pomFactorTypeNum], &(str[2]) )->GetId();
    (*lmIdMap)[factorId * 10 + pomFactorTypeNum] = lmId;
  }

  size_t factorIdStart;
  size_t factorIdEnd;

  // sentence markers: register <s>/</s> for every ordered factor type.
  for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index) {
    FactorType factorType = m_factorTypesOrdered[index];
    m_sentenceStartArray[index] 	= factorCollection.AddFactor(Output, factorType, BOS_);


    m_sentenceEndArray[index] 		= factorCollection.AddFactor(Output, factorType, EOS_);

    factorIdStart = m_sentenceStartArray[index]->GetId();
    factorIdEnd = m_sentenceEndArray[index]->GetId();

    /*for (size_t i = 0; i < 10; i++)
    {
      lmIdMap[factorIdStart * 10 + i] = GetLmID(BOS_);
    	lmIdMap[factorIdEnd * 10 + i] = GetLmID(EOS_);
    }*/

    // Same packed-key scheme as above for the boundary markers.
    (*lmIdMap)[factorIdStart * 10 + index] = GetLmID(BOS_);
    (*lmIdMap)[factorIdEnd * 10 + index] = GetLmID(EOS_);

    cerr << "BOS_:" << GetLmID(BOS_) << ", EOS_:" << GetLmID(EOS_) << endl;

  }

  // Word-tag ids for unknown / sentence-begin / sentence-end tokens.
  m_wtid = GetLmID("W-<unk>");
  m_wtbid = GetLmID("W-<s>");
  m_wteid = GetLmID("W-</s>");

  cerr << "W-<unk> index: " << m_wtid << endl;
  cerr << "W-<s> index: " << m_wtbid << endl;
  cerr << "W-</s> index: " << m_wteid << endl;


}
Пример #14
0
FFState* LanguageModelIRST::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
  // Scores the target words newly added by `hypo` with the IRSTLM model,
  // accumulates the (transformed) score into `out`, and returns the new LM
  // state. `ps` is the previous hypothesis state.
  if (!hypo.GetCurrTargetLength()) {
    std::auto_ptr<IRSTLMState> ret(new IRSTLMState(ps));
    return ret.release();
  }

  //[begin, end) in STL-like fashion.
  const int begin = (const int) hypo.GetCurrTargetWordsRange().GetStartPos();
  const int end = (const int) hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  const int adjust_end = (const int) std::min(end, begin + m_lmtb_size - 1);

  //set up context
  //fill the farthest positions with sentenceStart symbols, if "empty" positions are available
  //so that the vector looks like = "<s> <s> context_word context_word" for a two-word context and a LM of order 5
  int codes[m_lmtb_size];
  int idx=m_lmtb_size-1;
  int position = (const int) begin;
  while (position >= 0) {
    codes[idx] =  GetLmID(hypo.GetWord(position));
    --idx;
    --position;
  }
  while (idx>=0) {
    codes[idx] = m_lmtb_sentenceStart;
    --idx;
  }

  char* msp = NULL;
  float score = m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);

  // Slide the n-gram window across the remainder of the phrase.
  position = (const int) begin+1;
  while (position < adjust_end) {
    for (idx=1; idx<m_lmtb_size; idx++) {
      codes[idx-1] = codes[idx];
    }
    codes[idx-1] =  GetLmID(hypo.GetWord(position));
    score += m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);
    ++position;
  }

  //adding probability of having sentenceEnd symbol, after this phrase;
  //this could happen only when all source words are covered
  if (hypo.IsSourceCompleted()) {
    idx=m_lmtb_size-1;
    codes[idx] = m_lmtb_sentenceEnd;
    --idx;
    position = (const int) end - 1;
    while (position >= 0 && idx >= 0) {
      codes[idx] =  GetLmID(hypo.GetWord(position));
      --idx;
      --position;
    }
    while (idx>=0) {
      codes[idx] = m_lmtb_sentenceStart;
      --idx;
    }
    score += m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);
  } else {
    // need to set the LM state

    if (adjust_end < end)   { //the LMstate of this target phrase refers to the last m_lmtb_size-1 words
      position = (const int) end - 1;
      for (idx=m_lmtb_size-1; idx>0; --idx) {
        codes[idx] =  GetLmID(hypo.GetWord(position));
        // BUG FIX: the original never decremented `position`, so every slot
        // was filled with the same (last) word instead of the trailing
        // m_lmtb_size-1 words of the phrase.
        --position;
      }
      codes[idx] = m_lmtb_sentenceStart;
      msp = (char *) m_lmtb->cmaxsuffptr(codes,m_lmtb_size);
    }
  }

  score = TransformLMScore(score);
  out->PlusEquals(this, score);

  std::auto_ptr<IRSTLMState> ret(new IRSTLMState(msp));

  return ret.release();
}
Пример #15
0
int LanguageModelIRST::GetLmID( const Word &word ) const
{
  return GetLmID( word.GetFactor(m_factorType) );
}
Пример #16
0
float LanguageModelInternal::GetValue(const Factor *factor0, const Factor *factor1, const Factor *factor2, State* finalState) const
{
	// Trigram probability P(factor2 | factor0 factor1) with back-off:
	// try the full trigram first, then the (factor1 factor2) bigram plus
	// back-off weights, then the bare unigram of factor2.
	float score;
	const NGramNode *nGram[3];

	// Unigram node for the predicted word; NULL means it is OOV.
	nGram[2]		= GetLmID(factor2);
	if (nGram[2] == NULL)
	{
		// Unknown predicted word: no usable state, floor probability.
		if (finalState != NULL)
			*finalState = NULL;
		score = -numeric_limits<float>::infinity();
	}
	else
	{
		// Bigram node (factor1 factor2) hanging off factor2's unigram.
		nGram[1] = nGram[2]->GetNGram(factor1);
		if (nGram[1] == NULL)
		{ // something unigram
			// Only factor2's unigram matched; it becomes the LM state.
			if (finalState != NULL)
				*finalState = static_cast<const void*>(nGram[2]);
			
			// Add back-off weights of the history words where they exist.
			nGram[1]	= GetLmID(factor1);
			if (nGram[1] == NULL)
			{ // stops at unigram
				score = nGram[2]->GetScore();
			}
			else
			{
				nGram[0] = nGram[1]->GetNGram(factor0);
				if (nGram[0] == NULL)
				{ // unigram unigram
					score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff();
				}
				else
				{ // unigram bigram
					score = nGram[2]->GetScore() + nGram[1]->GetLogBackOff() + nGram[0]->GetLogBackOff();
				}	
			}			
		}
		else
		{ // trigram, or something bigram
			nGram[0] = nGram[1]->GetNGram(factor0);
			if (nGram[0] != NULL)
			{ // trigram
				// Full trigram matched; its node is the LM state.
				if (finalState != NULL)
					*finalState = static_cast<const void*>(nGram[0]);
				score = nGram[0]->GetScore();
			}
			else
			{
				// Bigram (factor1 factor2) exists but the trigram does not:
				// use the bigram score plus (factor0 factor1)'s back-off
				// weight if that history bigram is present.
				if (finalState != NULL)
					*finalState = static_cast<const void*>(nGram[1]);
				
				score			= nGram[1]->GetScore();
				// Re-root to factor1's top-level node to look up the
				// (factor0 factor1) history bigram.
				nGram[1]	= nGram[1]->GetRootNGram();
				nGram[0]	= nGram[1]->GetNGram(factor0);
				if (nGram[0] == NULL)
				{ // just bigram
					// do nothing
				}
				else
				{
					score	+= nGram[0]->GetLogBackOff();
				}

			}
			// else do nothing. just use 1st bigram
		}
	}

	return FloorScore(score);

}