Example #1
void KENLM<Model>::CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore,
                             float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  size_t position;
  if (m_bos == phrase[0][m_factorType]) {
    scorer.BeginSentence();
    position = 1;
  } else {
    position = 0;
  }

  size_t ngramBoundary = m_ngram->Order() - 1;

  size_t end_loop = std::min(ngramBoundary, phrase.GetSize());
  for (; position < end_loop; ++position) {
    const SCFG::Word &word = phrase[position];
    if (word.isNonTerminal) {
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (!index) ++oovCount;
    }
  }
  float before_boundary = fullScore + scorer.Finish();
  for (; position < phrase.GetSize(); ++position) {
    const SCFG::Word &word = phrase[position];
    if (word.isNonTerminal) {
      fullScore += scorer.Finish();
      scorer.Reset();
    } else {
      lm::WordIndex index = TranslateID(word);
      scorer.Terminal(index);
      if (!index) ++oovCount;
    }
  }
  fullScore += scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);
}
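A note on the split above: the first Order()-1 words of a phrase cannot be scored with full context, so their contribution (before_boundary) is subtracted out of ngramScore and will be re-estimated once real left context is known. The following standalone sketch mirrors just that bookkeeping; ToyScore is a made-up stand-in for an LM query, not part of KenLM or Moses.

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for a real LM query: log-prob of words[pos]
// given at most (order - 1) preceding words.
static float ToyScore(const std::vector<int> &words, std::size_t pos, std::size_t order) {
  std::size_t contextLen = std::min(pos, order - 1);
  return -1.0f - 0.1f * static_cast<float>(contextLen);
}

// Mirrors CalcScore's bookkeeping: fullScore covers every word, while
// ngramScore keeps only the words scored with a full (order - 1)-word context.
static void SplitScores(const std::vector<int> &words, std::size_t order,
                        float &fullScore, float &ngramScore) {
  fullScore = 0.0f;
  float beforeBoundary = 0.0f;
  std::size_t boundary = std::min(order - 1, words.size());
  for (std::size_t pos = 0; pos < words.size(); ++pos) {
    float s = ToyScore(words, pos, order);
    fullScore += s;
    if (pos < boundary) beforeBoundary += s;
  }
  ngramScore = fullScore - beforeBoundary;
}

int main() {
  std::vector<int> phrase = {1, 2, 3, 4, 5};
  float full = 0.0f, ngram = 0.0f;
  SplitScores(phrase, /*order=*/3, full, ngram);
  std::cout << "fullScore=" << full << " ngramScore=" << ngram << "\n";
  return 0;
}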
Example #2
/**
 * Pre-calculate the n-gram probabilities for the words in the specified phrase.
 *
 * Note that when this method is called, we do not have access to the context
 * in which this phrase will eventually be applied.
 *
 * In other words, we know what words are in this phrase,
 * but we do not know what words will come before or after this phrase.
 *
 * The parameters fullScore, ngramScore, and oovCount are all output parameters.
 *
 * The value stored in oovCount is the number of words in the phrase
 * that are not in the language model's vocabulary.
 *
 * The sum of the ngram scores for all words in this phrase are stored in fullScore.
 *
 * The value stored in ngramScore is similar, but only full-order ngram scores are included.
 *
 * This is best shown by example:
 *
 * Assume a trigram backward language model and a phrase "a b c d e f g"
 *
 * fullScore would represent the sum of the logprob scores for the following values:
 *
 * p(g)
 * p(f | g)
 * p(e | g f)
 * p(d | f e)
 * p(c | e d)
 * p(b | d c)
 * p(a | c b)
 *
 * ngramScore would represent the sum of the logprob scores for the following values:
 *
 * p(b | d c)
 * p(a | c b)
 */
template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) return;

  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  UTIL_THROW_IF(
    (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
    util::Exception,
    "BackwardLanguageModel does not currently support rules that include <s>"
  );

  float before_boundary = 0.0f;

  int lastWord = phrase.GetSize() - 1;
  int ngramBoundary = m_ngram->Order() - 1;
  int boundary = ( lastWord < ngramBoundary ) ? 0 : ngramBoundary;

  int position;
  for (position = lastWord; position >= 0; position-=1) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (!index) ++oovCount;

    if (position==boundary) {
      before_boundary = scorer.Finish();
    }

  }

  fullScore = scorer.Finish();

  ngramScore = TransformLMScore(fullScore - before_boundary);
  fullScore = TransformLMScore(fullScore);

}
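The same subtraction works in the backward direction, except the scorer walks the phrase right to left and captures the running total once it reaches the boundary index, leaving only the first order-1 words' scores in ngramScore. A toy trace of that control flow, with an invented ToyBackwardScore standing in for the model:

#include <iostream>
#include <vector>

// Invented stand-in for a backward LM query: p(words[pos] | following words).
static float ToyBackwardScore(const std::vector<int> &words, int pos) {
  return -1.0f - 0.01f * static_cast<float>(words[pos]);
}

int main() {
  std::vector<int> phrase = {1, 2, 3, 4, 5, 6, 7}; // "a b c d e f g"
  const int order = 3;
  int lastWord = static_cast<int>(phrase.size()) - 1;
  int boundary = (lastWord < order - 1) ? 0 : order - 1;

  float fullScore = 0.0f, beforeBoundary = 0.0f;
  for (int position = lastWord; position >= 0; --position) {
    fullScore += ToyBackwardScore(phrase, position);
    if (position == boundary) beforeBoundary = fullScore; // running total so far
  }
  // Only the scores of the first two words (p(b | d c) and p(a | c b) in the
  // doc comment above) survive the subtraction.
  float ngramScore = fullScore - beforeBoundary;
  std::cout << "fullScore=" << fullScore << " ngramScore=" << ngramScore << "\n";
  return 0;
}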
Example #3
LMResult LanguageModelDALM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;

  // initialize DALM array
  DALM::VocabId ngram[m_nGramOrder];
  for (size_t i = 0; i < m_nGramOrder; i++) {
    ngram[i] = wid_start;
  }

  DALM::VocabId wid = wid_start; // initialised so the unknown check below is safe on an empty context
  for (size_t i = 0; i < contextFactor.size(); ++i) {
    const Word &word = *contextFactor[i];
    wid = GetVocabId(word.GetFactor(m_factorType));
    push(ngram, m_nGramOrder, wid);
  }

  // last word is unk?
  ret.unknown = (wid == m_vocab->unk());

  // calc score. Doesn't handle unk yet
  float score = m_lm->query(ngram, m_nGramOrder);
  score = TransformLMScore(score);
  ret.score = score;

  (*finalState) = (void *)m_lm->get_state(ngram, m_nGramOrder);

  return ret;
}
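push() is not shown in this example; presumably it slides the fixed ngram window by one slot and writes the newest id into the freed position, so the array always holds the latest m_nGramOrder ids (padded with wid_start). Whether the newest word sits at index 0 or at the end depends on DALM's query convention; the sketch below assumes newest-last.

#include <cstddef>
#include <iostream>

typedef unsigned int VocabId; // stand-in for DALM::VocabId

// Assumed behaviour of push(): drop the oldest id, append the newest.
static void push(VocabId *ngram, std::size_t order, VocabId wid) {
  for (std::size_t i = 1; i < order; ++i) ngram[i - 1] = ngram[i];
  ngram[order - 1] = wid;
}

int main() {
  const std::size_t order = 4;
  VocabId ngram[order] = {0, 0, 0, 0}; // all slots start as wid_start (0 here)
  for (VocabId wid = 10; wid < 16; ++wid) push(ngram, order, wid);
  for (std::size_t i = 0; i < order; ++i) std::cout << ngram[i] << " ";
  std::cout << "\n"; // prints: 12 13 14 15
  return 0;
}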
float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  FactorType factorType = GetFactorType();

  // set up context
  size_t count = contextFactor.size();

  int codes[MAX_NGRAM_SIZE];

  size_t idx = 0;
  // fill the farthest positions with at most ONE sentenceEnd symbol and at most
  // ONE sentenceStart symbol, if "empty" positions are available, so that the
  // vector looks like "</s> <s> context_word context_word" for a two-word
  // context and a LM of order 5
  if (count < (size_t) (m_lmtb_size-1)) codes[idx++] = m_lmtb_sentenceEnd;
  if (count < (size_t) m_lmtb_size) codes[idx++] = m_lmtb_sentenceStart;

  for (size_t i = 0 ; i < count ; i++)
    codes[idx++] = GetLmID((*contextFactor[i])[factorType]);

  float prob;
  char* msp = NULL;
  unsigned int ilen;
  prob = m_lmtb->clprob(codes, idx, NULL, NULL, &msp, &ilen);

  if (finalState) *finalState = (State *) msp;

  return TransformLMScore(prob);
}
Example #5
LMResult LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  // set up context
  size_t count = contextFactor.size();

  int codes[MAX_NGRAM_SIZE];

  size_t idx = 0;
  // fill the farthest positions with at most ONE sentenceEnd symbol and at most
  // ONE sentenceStart symbol, if "empty" positions are available, so that the
  // vector looks like "</s> <s> context_word context_word" for a two-word
  // context and a LM of order 5
  if (count < (size_t) (m_lmtb_size-1)) codes[idx++] = m_lmtb_sentenceEnd;
  if (count < (size_t) m_lmtb_size) codes[idx++] = m_lmtb_sentenceStart;

  for (size_t i = 0 ; i < count ; i++) {
    codes[idx] =  GetLmID(*contextFactor[i]);
    ++idx;
  }

  LMResult result;
  result.unknown = (codes[idx - 1] == m_unknownId);

  char* msp = NULL;
  result.score = m_lmtb->clprob(codes,idx,NULL,NULL,&msp);

  if (finalState) *finalState=(State *) msp;

  result.score = TransformLMScore(result.score);

  return result;
}
float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const
{
  unsigned int dummy;
  if (!len) { len = &dummy; }
  FactorType factorType = GetFactorType();

  // set up context
  size_t count = contextFactor.size();

  m_lmtb_ng->size = 0;
  if (count < (size_t)(m_lmtb_size-1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd);
  if (count < (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart);

  for (size_t i = 0 ; i < count ; i++) {
#ifdef DEBUG
    cout << "i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n";
#endif
    int lmId = GetLmID((*contextFactor[i])[factorType]->GetString());
    m_lmtb_ng->pushc(lmId);
  }

  if (finalState) {
    *finalState = (State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng);
    // back off stats not currently available
    *len = 0;
  }

  float prob = m_lmtb->clprob(*m_lmtb_ng);

  return TransformLMScore(prob);
}
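The context setup shared by both IRST variants is worth isolating: when fewer context words are available than the model order, the oldest positions receive at most one sentence-end and one sentence-start symbol. A standalone sketch with invented ids:

#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const int kSentenceStart = 1; // hypothetical <s> id
  const int kSentenceEnd = 2;   // hypothetical </s> id
  const int lmOrder = 5;

  std::vector<int> context = {42, 43}; // two context word ids
  std::size_t count = context.size();

  std::vector<int> codes;
  // At most one </s> and one <s> go into the farthest positions, so a
  // two-word context under a 5-gram LM becomes "</s> <s> w1 w2".
  if (count < static_cast<std::size_t>(lmOrder - 1)) codes.push_back(kSentenceEnd);
  if (count < static_cast<std::size_t>(lmOrder)) codes.push_back(kSentenceStart);
  for (std::size_t i = 0; i < count; ++i) codes.push_back(context[i]);

  for (std::size_t i = 0; i < codes.size(); ++i) std::cout << codes[i] << " ";
  std::cout << "\n"; // prints: 2 1 42 43
  return 0;
}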
Example #7
LMResult LanguageModelSRI::GetValue(VocabIndex wordId, VocabIndex *context) const
{
  LMResult ret;
  ret.score = FloorScore(TransformLMScore(m_srilmModel->wordProb( wordId, context)));
  ret.unknown = (wordId == m_unknownId);
  return ret;
}
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & /*outState */) const
{

  static WidMatrix widMatrix;

  for (size_t i = 0; i < contextFactor.size(); i++)
    ::memset(widMatrix[i], 0, (m_factorTypesOrdered.size() + 1) * sizeof(VocabIndex));


  for (size_t i = 0; i < contextFactor.size(); i++) {
    const Word &word = *contextFactor[i];

    for (size_t j = 0; j < m_factorTypesOrdered.size(); j++) {
      const Factor *factor = word[ m_factorTypesOrdered[j] ];

      if (factor == NULL)
        widMatrix[i][j + 1] = 0;
      else
        widMatrix[i][j + 1] = GetLmID(factor, j);
    }

    if (widMatrix[i][1] == GetLmID(m_sentenceStartArray[0], 0) ) {
      widMatrix[i][0] = m_wtbid;
    } else if (widMatrix[i][1] == GetLmID(m_sentenceEndArray[0], 0 )) {
      widMatrix[i][0] = m_wteid;
    } else {
      widMatrix[i][0] = m_wtid;
    }
  }


  LMResult ret;
  ret.score = m_srilmModel->wordProb( widMatrix, contextFactor.size() - 1, contextFactor.size() );
  ret.score = FloorScore(TransformLMScore(ret.score));
  ret.unknown = !contextFactor.empty() && (widMatrix[contextFactor.size() - 1][0] == m_unknownId);
  return ret;

  /*if (contextFactor.size() == 0)
  {
  	return 0;
  }

  for (size_t currPos = 0 ; currPos < m_nGramOrder ; ++currPos )
  {
  	const Word &word = *contextFactor[currPos];

  	for (size_t index = 0 ; index < m_factorTypesOrdered.size() ; ++index)
  	{
  		FactorType factorType = m_factorTypesOrdered[index];
  		const Factor *factor = word[factorType];

  		(*widMatrix)[currPos][index] = GetLmID(factor, index);

  	}

  }

  float p = m_srilmModel->wordProb( (*widMatrix), m_nGramOrder - 1, m_nGramOrder );
  return FloorScore(TransformLMScore(p)); */
}
void LanguageModel::Load(System &system)
{
  FactorCollection &fc = system.GetVocab();

  m_bos = fc.AddFactor(BOS_, system, false);
  m_eos = fc.AddFactor(EOS_, system, false);

  InputFileStream infile(m_path);
  size_t lineNum = 0;
  string line;
  while (getline(infile, line)) {
    if (++lineNum % 100000 == 0) {
      cerr << lineNum << " ";
    }

    vector<string> substrings = Tokenize(line, "\t");

    if (substrings.size() < 2) continue;

    assert(substrings.size() == 2 || substrings.size() == 3);

    SCORE prob = TransformLMScore(Scan<SCORE>(substrings[0]));
    if (substrings[1] == "<unk>") {
      m_oov = prob;
      continue;
    }

    SCORE backoff = 0.f;
    if (substrings.size() == 3) {
      backoff = TransformLMScore(Scan<SCORE>(substrings[2]));
    }

    // ngram
    vector<string> key = Tokenize(substrings[1], " ");

    vector<const Factor*> factorKey(key.size());
    for (size_t i = 0; i < key.size(); ++i) {
      factorKey[factorKey.size() - i - 1] = fc.AddFactor(key[i], system, false);
    }

    m_root.insert(factorKey, LMScores(prob, backoff));
  }

}
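One detail in Load() deserves a callout: factorKey is filled back to front, so the n-gram is keyed with the rightmost word (the one being predicted) first and a lookup can walk outward through its history, sharing prefixes across n-gram orders. A toy version of just the reversal; names are invented:

#include <cstddef>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

int main() {
  std::string ngram = "a b c"; // ARPA line key, left to right

  // Tokenize on spaces.
  std::vector<std::string> words;
  std::istringstream iss(ngram);
  std::string tok;
  while (iss >> tok) words.push_back(tok);

  // Reverse while inserting, exactly as Load() fills factorKey:
  // the rightmost word lands at index 0.
  std::vector<std::string> key(words.size());
  for (std::size_t i = 0; i < words.size(); ++i)
    key[key.size() - i - 1] = words[i];

  for (std::size_t i = 0; i < key.size(); ++i) std::cout << key[i] << " ";
  std::cout << "\n"; // prints: c b a
  return 0;
}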
Example #10
void KENLM<Model>::EvaluateWhenApplied(const SCFG::Manager &mgr,
                                       const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
                                       FFState &state) const
{
  LanguageModelChartStateKenLM &newState = static_cast<LanguageModelChartStateKenLM&>(state);
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, newState.GetChartState());
  const SCFG::TargetPhraseImpl &target = hypo.GetTargetPhrase();
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
    target.GetAlignNonTerm().GetNonTermIndexMap();

  const size_t size = target.GetSize();
  size_t phrasePos = 0;
  // Special cases for first word.
  if (size) {
    const SCFG::Word &word = target[0];
    if (word[m_factorType] == m_bos) {
      // Begin of sentence
      ruleScore.BeginSentence();
      phrasePos++;
    } else if (word.isNonTerminal) {
      // Non-terminal is first so we can copy instead of rescoring.
      const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
      ruleScore.BeginNonTerminal(prevState);
      phrasePos++;
    }
  }

  for (; phrasePos < size; phrasePos++) {
    const SCFG::Word &word = target[phrasePos];
    if (word.isNonTerminal) {
      const SCFG::Hypothesis *prevHypo = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &prevState = static_cast<const LanguageModelChartStateKenLM*>(prevHypo->GetState(featureID))->GetChartState();
      ruleScore.NonTerminal(prevState);
    } else {
      ruleScore.Terminal(TranslateID(word));
    }
  }

  float score = ruleScore.Finish();
  score = TransformLMScore(score);

  // take out score from loading. This needs reworking
  //score -= target.GetScores().GetScores(*this)[0];

  bool OOVFeatureEnabled = false;
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(mgr.system, *this, scoresVec);
  } else {
    scores.PlusEquals(mgr.system, *this, score);
  }
}
Example #11
void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if ( !phrase.GetSize() ) return;

  int _min = min(m_lmtb_size - 1, (int) phrase.GetSize());

  int codes[m_lmtb_size];
  int idx = 0;
  codes[idx] = m_lmtb_sentenceStart;
  ++idx;
  int position = 0;

  char* msp = NULL;
  float before_boundary = 0.0;
  for (; position < _min; ++position) {
    codes[idx] = GetLmID(phrase.GetWord(position));
    if (codes[idx] == m_unknownId) ++oovCount;
    before_boundary += m_lmtb->clprob(codes,idx+1,NULL,NULL,&msp);
    ++idx;
  }

  ngramScore = 0.0;
  int end_loop = (int) phrase.GetSize();

  for (; position < end_loop; ++position) {
    for (idx = 1; idx < m_lmtb_size; ++idx) {
      codes[idx-1] = codes[idx];
    }
    codes[idx-1] = GetLmID(phrase.GetWord(position));
    if (codes[idx-1] == m_unknownId) ++oovCount;
    ngramScore += m_lmtb->clprob(codes,idx,NULL,NULL,&msp);
  }
  before_boundary = TransformLMScore(before_boundary);
  ngramScore = TransformLMScore(ngramScore);
  fullScore = ngramScore + before_boundary;
}
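The second loop above maintains the n-gram window in place: each code moves one slot to the left and the newest word id enters at the right edge before the window is rescored. The shift in isolation, with made-up ids:

#include <iostream>

int main() {
  const int order = 3;
  int codes[order] = {1, 10, 11}; // <s> plus the first two word ids

  const int incoming[] = {12, 13, 14};
  for (int i = 0; i < 3; ++i) {
    // Shift the window left by one, as the CalcScore loop does...
    for (int idx = 1; idx < order; ++idx) codes[idx - 1] = codes[idx];
    // ...then place the newest word id in the freed last slot.
    codes[order - 1] = incoming[i];
  }
  for (int i = 0; i < order; ++i) std::cout << codes[i] << " ";
  std::cout << "\n"; // prints: 12 13 14
  return 0;
}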
Example #12
LMResult LanguageModelDALM::GetValue(DALM::VocabId wid, DALM::State* finalState) const{
  LMResult ret;

  // last word is unk?
  ret.unknown = (wid == m_vocab->unk());

  // calc score.
  float score = m_lm->query(wid, *finalState);
  score = TransformLMScore(score);
  ret.score = score;

  return ret;
}
Example #13
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor,
                                       State* finalState) const
{
    FactorType factorType = GetFactorType();
    // set up context
    randlm::WordID ngram[MAX_NGRAM_SIZE];
    int count = contextFactor.size();
    for (int i = 0 ; i < count ; i++) {
        ngram[i] = GetLmID((*contextFactor[i])[factorType]);
        //std::cerr << m_lm->getWord(ngram[i]) << " ";
    }
    int found = 0;
    LMResult ret;
    ret.score = FloorScore(TransformLMScore(m_lm->getProb(&ngram[0], count, &found, finalState)));
    ret.unknown = count && (ngram[count - 1] == m_oov_id);
    //if (finalState)
    //  std::cerr << " = " << logprob << "(" << *finalState << ", " <<")"<< std::endl;
    //else
    //  std::cerr << " = " << logprob << std::endl;
    return ret;
}
Example #14
LMResult LanguageModelDALM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;

  // initialize DALM array
  DALM::VocabId ngram[m_nGramOrder];
  for (size_t i = 0; i < m_nGramOrder; i++) {
    ngram[i] = wid_start;
  }

  DALM::VocabId wid = wid_start; // initialised so the unknown check below is safe on an empty context
  for (size_t i = 0; i < contextFactor.size(); ++i) {
    const Word &word = *contextFactor[i];
    wid = GetVocabId(word.GetFactor(m_factorType));
    push(ngram, m_nGramOrder, wid);
  }

  // last word is unk?
  ret.unknown = (wid == DALM_UNK_WORD);

  // calc score. Doesn't handle unk yet
  float score = m_lm->query(ngram, m_nGramOrder);
  score = TransformLMScore(score);
  ret.score = score;

  // hash of n-1 words to use as state
  size_t startPos = (contextFactor.size() < m_nGramOrder) ? 0 : 1;

  size_t hash = 0;
  for (size_t i = startPos; i < contextFactor.size(); ++i) {
    const Word &word = *contextFactor[i];
    const Factor *factor = word.GetFactor(m_factorType);
    boost::hash_combine(hash, factor);
  }

  (*finalState) = (State*) hash;

  return ret;
}
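Example #14 represents the LM state as a combined hash of the trailing factor pointers rather than a real model state. The pattern in isolation (Boost required, as in the original; plain ints stand in for the Factor pointers):

#include <boost/functional/hash.hpp>
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  const std::size_t order = 3;
  std::vector<int> context = {7, 8, 9}; // stand-ins for Factor pointers

  // Skip the oldest word once the context is full, so at most the last
  // (order - 1) items feed the state hash, as in the wrapper above.
  std::size_t startPos = (context.size() < order) ? 0 : 1;

  std::size_t hash = 0;
  for (std::size_t i = startPos; i < context.size(); ++i)
    boost::hash_combine(hash, context[i]);

  std::cout << "state hash = " << hash << "\n";
  return 0;
}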
Example #15
LMResult LanguageModelDALM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;

  // initialize DALM array
  DALM::VocabId ngram[m_nGramOrder];
  for (size_t i = 0; i < m_nGramOrder; i++) {
    ngram[i] = wid_start;
  }

  DALM::VocabId wid = wid_start; // initialised so the unknown check below is safe on an empty context
  for (size_t i = 0; i < contextFactor.size(); ++i) {
    const Word &word = *contextFactor[i];
    wid = GetVocabId(word.GetFactor(m_factorType));
    push(ngram, m_nGramOrder, wid);
  }

  // last word
  ret.unknown = (wid == DALM_UNK_WORD);

  float prob = m_lm->query(ngram, m_nGramOrder);
  ret.score = TransformLMScore(prob);

  // use last word as state info
  const Factor *factor;
  if (contextFactor.size()) {
    factor = contextFactor.back()->GetFactor(m_factorType);
  } else {
    factor = NULL;
  }

  (*finalState) = (State*) factor;

  return ret;
}
bool LanguageModelInternal::Load(const std::string &filePath
																, FactorType factorType
																, float weight
																, size_t nGramOrder)
{
	assert(nGramOrder <= 3);
	if (nGramOrder > 3)
	{
		UserMessage::Add("Can only do up to trigram. Aborting");
		abort();
	}

	VERBOSE(1, "Loading Internal LM: " << filePath << endl);
	
	FactorCollection &factorCollection = FactorCollection::Instance();

	m_filePath		= filePath;
	m_factorType	= factorType;
	m_weight			= weight;
	m_nGramOrder	= nGramOrder;

	// make sure start & end tags in factor collection
	m_sentenceStart	= factorCollection.AddFactor(Output, m_factorType, BOS_);
	m_sentenceStartArray[m_factorType] = m_sentenceStart;

	m_sentenceEnd		= factorCollection.AddFactor(Output, m_factorType, EOS_);
	m_sentenceEndArray[m_factorType] = m_sentenceEnd;

	// read in file
	VERBOSE(1, filePath << endl);

	InputFileStream 	inFile(filePath);

	// to create lookup vector later on
	size_t maxFactorId = 0; 
	map<size_t, const NGramNode*> lmIdMap;

	string line;
	int lineNo = 0;
	
	while( !getline(inFile, line, '\n').eof())
	{
		lineNo++;

		if (line.size() != 0 && line.substr(0,1) != "\\")
		{
			vector<string> tokens = Tokenize(line, "\t");
			if (tokens.size() >= 2)
			{
				// split unigram/bigram trigrams
				vector<string> factorStr = Tokenize(tokens[1], " ");

				// create / traverse down tree
				NGramCollection *ngramColl = &m_map;
				NGramNode *nGram;
				const Factor *factor;
				for (int currFactor = (int) factorStr.size() - 1 ; currFactor >= 0  ; currFactor--)
				{
					factor = factorCollection.AddFactor(Output, m_factorType, factorStr[currFactor]);
					nGram = ngramColl->GetOrCreateNGram(factor);
	
					ngramColl = nGram->GetNGramColl();

				}

				NGramNode *rootNGram = m_map.GetNGram(factor);
				nGram->SetRootNGram(rootNGram);

				// create vector of factors used in this LM
				size_t factorId = factor->GetId();
				maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
				lmIdMap[factorId] = rootNGram;
				//factorCollection.SetFactorLmId(factor, rootNGram);

				float score = TransformLMScore(Scan<float>(tokens[0]));
				nGram->SetScore( score );
				if (tokens.size() == 3)
				{
					float logBackOff = TransformLMScore(Scan<float>(tokens[2]));
					nGram->SetLogBackOff( logBackOff );
				}
				else
				{
					nGram->SetLogBackOff( 0 );
				}
			}
		}
	}

		// add to lookup vector in object
	m_lmIdLookup.resize(maxFactorId+1);
	fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), static_cast<const NGramNode*>(NULL));

	map<size_t, const NGramNode*>::iterator iterMap;
	for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap)
	{
		m_lmIdLookup[iterMap->first] = iterMap->second;
	}

	return true;
}
Example #17
FFState* LanguageModelIRST::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
  if (!hypo.GetCurrTargetLength()) {
    std::auto_ptr<IRSTLMState> ret(new IRSTLMState(ps));
    return ret.release();
  }

  //[begin, end) in STL-like fashion.
  const int begin = (const int) hypo.GetCurrTargetWordsRange().GetStartPos();
  const int end = (const int) hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  const int adjust_end = (const int) std::min(end, begin + m_lmtb_size - 1);

  //set up context
  //fill the farthest positions with sentenceStart symbols, if "empty" positions are available
  //so that the vector looks like = "<s> <s> context_word context_word" for a two-word context and a LM of order 5
  int codes[m_lmtb_size];
  int idx=m_lmtb_size-1;
  int position = (const int) begin;
  while (position >= 0 && idx >= 0) {
    codes[idx] =  GetLmID(hypo.GetWord(position));
    --idx;
    --position;
  }
  while (idx>=0) {
    codes[idx] = m_lmtb_sentenceStart;
    --idx;
  }

  char* msp = NULL;
  float score = m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);

  position = (const int) begin+1;
  while (position < adjust_end) {
    for (idx=1; idx<m_lmtb_size; idx++) {
      codes[idx-1] = codes[idx];
    }
    codes[idx-1] =  GetLmID(hypo.GetWord(position));
    score += m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);
    ++position;
  }

  //adding probability of having sentenceEnd symbol, after this phrase;
  //this could happen only when all source words are covered
  if (hypo.IsSourceCompleted()) {
    idx=m_lmtb_size-1;
    codes[idx] = m_lmtb_sentenceEnd;
    --idx;
    position = (const int) end - 1;
    while (position >= 0 && idx >= 0) {
      codes[idx] =  GetLmID(hypo.GetWord(position));
      --idx;
      --position;
    }
    while (idx>=0) {
      codes[idx] = m_lmtb_sentenceStart;
      --idx;
    }
    score += m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);
  } else {
    // need to set the LM state

    if (adjust_end < end)   { //the LMstate of this target phrase refers to the last m_lmtb_size-1 words
      position = (const int) end - 1;
      for (idx=m_lmtb_size-1; idx>0; --idx) {
        codes[idx] =  GetLmID(hypo.GetWord(position));
        --position;
      }
      codes[idx] = m_lmtb_sentenceStart;
      msp = (char *) m_lmtb->cmaxsuffptr(codes,m_lmtb_size);
    }
  }

  score = TransformLMScore(score);
  out->PlusEquals(this, score);

  std::auto_ptr<IRSTLMState> ret(new IRSTLMState(msp));

  return ret.release();
}
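The context fill at the top of Example #17 walks backwards from begin and pads whatever is left with <s>; guarding both cursors (as the sentence-end block already does) keeps idx from running past the front of the array. The fill in isolation, ids invented:

#include <iostream>
#include <vector>

int main() {
  const int kSentenceStart = 1; // hypothetical <s> id
  const int order = 5;

  std::vector<int> translation = {20, 21, 22}; // target word ids produced so far
  int begin = 2; // position of the first word of the current phrase

  int codes[order];
  int idx = order - 1;
  int position = begin;
  // Walk back through the existing words, guarding both cursors.
  while (position >= 0 && idx >= 0) {
    codes[idx--] = translation[position--];
  }
  // Pad the remaining (oldest) slots with <s>.
  while (idx >= 0) codes[idx--] = kSentenceStart;

  for (int i = 0; i < order; ++i) std::cout << codes[i] << " ";
  std::cout << "\n"; // prints: 1 1 20 21 22
  return 0;
}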
Example #18
LMResult LanguageModelRemote::GetValue(const std::vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;
  ret.unknown = false;
  size_t count = contextFactor.size();
  if (count == 0) {
    if (finalState) *finalState = NULL;
    ret.score = 0.0;
    return ret;
  }
  //std::cerr << "contextFactor.size() = " << count << "\n";
  size_t max = m_nGramOrder;
  const FactorType factor = GetFactorType();
  if (max > count) max = count;

  Cache* cur = &m_cache;
  int pc = static_cast<int>(count) - 1;
  for (int i = 0; i < pc; ++i) {
    const Factor* f = contextFactor[i]->GetFactor(factor);
    cur = &cur->tree[f ? f : BOS];
  }
  const Factor* event_word = contextFactor[pc]->GetFactor(factor);
  cur = &cur->tree[event_word ? event_word : EOS];
  if (cur->prob) {
    if (finalState) *finalState = cur->boState;
    ret.score = cur->prob;
    return ret;
  }
  cur->boState = *reinterpret_cast<const State*>(&m_curId);
  ++m_curId;

  std::ostringstream os;
  os << "prob ";
  if (event_word == NULL) {
    os << "</s>";
  } else {
    os << event_word->GetString();
  }
  for (size_t i=1; i<max; i++) {
    const Factor* f = contextFactor[count-1-i]->GetFactor(factor);
    if (f == NULL) {
      os << " <s>";
    } else {
      os << ' ' << f->GetString();
    }
  }
  os << std::endl;
  std::string out = os.str();
  write(sock, out.c_str(), out.size());
  char res[6];
  int r = read(sock, res, 6);
  int errors = 0;
  int cnt = 0;
  while (1) {
    if (r < 0) {
      errors++;
      sleep(1);
      //std::cerr << "Error: read()\n";
      if (errors > 5) exit(1);
    } else if (r==0 || res[cnt] == '\n') {
      break;
    } else {
      cnt += r;
      if (cnt==6) break;
      r = read(sock, &res[cnt], 6-cnt);
    }
  }
  cur->prob = FloorScore(TransformLMScore(*reinterpret_cast<float*>(res)));
  if (finalState) {
    *finalState = cur->boState;
  }
  ret.score = cur->prob;
  return ret;
}
    void testCalcScore() {

      double p_the      = -1.383059;
      double p_licenses = -2.360783;
      double p_for      = -1.661813;
      double p_most     = -2.360783;
      //      double p_software = -1.62042;

      double p_the_licenses  = -0.9625873;
      double p_licenses_for  = -1.661557;
      double p_for_most      = -0.4526253;
      //      double p_most_software = -1.70295; 

      double p_the_licenses_for  = p_the_licenses + p_licenses_for;
      //      double p_licenses_for_most = p_licenses_for + p_for_most;
 
      // the
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 1 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
      }

      // the licenses
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the licenses", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 2 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01);
      }
      
      // the licenses for
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the licenses for", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 3 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01);
      }
     
      // the licenses for most
      {
	Phrase phrase;
	BOOST_CHECK( phrase.GetSize() == 0 );

	std::vector<FactorType> outputFactorOrder;
	outputFactorOrder.push_back(0);

	phrase.CreateFromString(
				outputFactorOrder,
				"the licenses for most", 
				StaticData::Instance().GetFactorDelimiter());

	BOOST_CHECK( phrase.GetSize() == 4 );
      
	float fullScore;
	float ngramScore;
	size_t oovCount;
	backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount);

	BOOST_CHECK( oovCount == 0 );
	SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01);
	SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01);
      }
 
    }
Example #20
void KENLM<Model>::EvaluateWhenApplied(const ManagerBase &mgr,
                                       const Hypothesis &hypo, const FFState &prevState, Scores &scores,
                                       FFState &state) const
{
  KenLMState &stateCast = static_cast<KenLMState&>(state);

  const System &system = mgr.system;

  const lm::ngram::State &in_state =
    static_cast<const KenLMState&>(prevState).state;

  if (!hypo.GetTargetPhrase().GetSize()) {
    stateCast.state = in_state;
    return;
  }

  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
  //[begin, end) in STL-like fashion.
  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);

  std::size_t position = begin;
  typename Model::State aux_state;
  typename Model::State *state0 = &stateCast.state, *state1 = &aux_state;

  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)),
                               *state0);
  ++position;
  for (; position < adjust_end; ++position) {
    score += m_ngram->Score(*state0, TranslateID(hypo.GetWord(position)),
                            *state1);
    std::swap(state0, state1);
  }

  if (hypo.GetBitmap().IsComplete()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last,
                                           m_ngram->GetVocabulary().EndSentence(), stateCast.state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, stateCast.state);
  } else if (state0 != &stateCast.state) {
    // Short enough phrase that we can just reuse the state.
    stateCast.state = *state0;
  }

  score = TransformLMScore(score);

  bool OOVFeatureEnabled = false;
  if (OOVFeatureEnabled) {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(system, *this, scoresVec);
  } else {
    scores.PlusEquals(system, *this, score);
  }
}
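Finally, the state0/state1 ping-pong in Example #20 is a small double-buffering trick: each Score call writes its successor state into the spare buffer and the two pointers swap, so no state struct is ever copied inside the loop. A minimal sketch with an invented State and Score, not KenLM's actual types:

#include <algorithm>
#include <iostream>

struct State { int lastWord; }; // stand-in for Model::State

// Invented scorer: writes the successor state into out and returns a fake logprob.
static float Score(const State &in, int word, State &out) {
  out.lastWord = word;
  return -1.0f - 0.001f * static_cast<float>(in.lastWord);
}

int main() {
  State stateA = {0}, stateB = {0};
  State *state0 = &stateA, *state1 = &stateB;

  const int words[] = {5, 6, 7, 8};
  float score = 0.0f;
  for (int i = 0; i < 4; ++i) {
    score += Score(*state0, words[i], *state1); // successor lands in the spare buffer
    std::swap(state0, state1);                  // spare becomes current; pointers move, not structs
  }
  std::cout << "score=" << score << " last word in state=" << state0->lastWord << "\n";
  return 0;
}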