/**
 * Score a rule's target phrase without any surrounding context.
 *
 * All three reference arguments are outputs:
 *  - fullScore  : transformed score of the whole phrase;
 *  - ngramScore : transformed score accumulated from index
 *                 min(Order() - 1, phrase size) onwards;
 *  - oovCount   : number of terminals whose TranslateID() is 0.
 *
 * At a non-terminal the score gathered so far is banked into fullScore and
 * the scorer is reset.
 */
void KENLM<Model>::CalcScore(const Phrase<SCFG::Word> &phrase, float &fullScore,
                             float &ngramScore, std::size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  const size_t numWords = phrase.GetSize();
  if (numWords == 0) {
    return;
  }

  // The resulting chart state is not needed here, only the score.
  lm::ngram::ChartState unusedState;
  lm::ngram::RuleScore<Model> ruleScorer(*m_ngram, unusedState);

  // A leading <s> is consumed through BeginSentence() rather than as a word.
  size_t pos = 0;
  if (m_bos == phrase[0][m_factorType]) {
    ruleScorer.BeginSentence();
    pos = 1;
  }

  const size_t contextLen = m_ngram->Order() - 1;
  const size_t boundary = std::min(contextLen, numWords);

  // Part 1: positions before the boundary.
  while (pos < boundary) {
    const SCFG::Word &current = phrase[pos];
    if (current.isNonTerminal) {
      fullScore += ruleScorer.Finish();
      ruleScorer.Reset();
    } else {
      const lm::WordIndex id = TranslateID(current);
      ruleScorer.Terminal(id);
      if (id == 0) {
        ++oovCount;
      }
    }
    ++pos;
  }

  // Snapshot of everything scored up to the boundary.
  const float scoreBeforeBoundary = fullScore + ruleScorer.Finish();

  // Part 2: the rest of the phrase.
  while (pos < numWords) {
    const SCFG::Word &current = phrase[pos];
    if (current.isNonTerminal) {
      fullScore += ruleScorer.Finish();
      ruleScorer.Reset();
    } else {
      const lm::WordIndex id = TranslateID(current);
      ruleScorer.Terminal(id);
      if (id == 0) {
        ++oovCount;
      }
    }
    ++pos;
  }

  fullScore += ruleScorer.Finish();

  ngramScore = TransformLMScore(fullScore - scoreBeforeBoundary);
  fullScore = TransformLMScore(fullScore);
}
/**
 * Pre-calculate the n-gram scores for the words of a phrase, feeding the
 * words to the scorer from the LAST word of the phrase to the FIRST.
 *
 * When this method is called the context in which the phrase will eventually
 * be applied is unknown: we know the words of the phrase, but not what comes
 * before or after it.
 *
 * fullScore, ngramScore and oovCount are all output parameters.
 *
 *  - oovCount   : number of words whose TranslateID() is 0.
 *  - fullScore  : transformed scorer total after every word has been fed.
 *  - ngramScore : transformed difference between that total and the scorer
 *                 total captured right after the word at index `boundary`
 *                 was fed, where boundary is Order() - 1, or 0 when the
 *                 phrase has fewer than Order() words.
 *
 * Example: with a trigram model and the phrase "a b c d e f g" the words are
 * fed in the order g f e d c b a and boundary is index 2 (word c).  The
 * snapshot therefore covers g, f, e, d and c, and ngramScore is what b and a
 * add on top of it.  For a phrase shorter than the model order the snapshot
 * is taken after the final word is fed, so ngramScore is TransformLMScore(0).
 *
 * Throws util::Exception if the phrase starts with <s> or contains a
 * non-terminal.
 */
template <class Model> void BackwardLanguageModel<Model>::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const
{
  fullScore = 0;
  ngramScore = 0;
  oovCount = 0;

  if (!phrase.GetSize()) {
    return;
  }

  // Only the score is wanted; the chart state is thrown away.
  lm::ngram::ChartState discarded_sadly;
  lm::ngram::RuleScore<Model> scorer(*m_ngram, discarded_sadly);

  UTIL_THROW_IF(
    (m_beginSentenceFactor == phrase.GetWord(0).GetFactor(m_factorType)),
    util::Exception,
    "BackwardLanguageModel does not currently support rules that include <s>"
  );

  const int lastIndex = phrase.GetSize() - 1;
  const int contextLen = m_ngram->Order() - 1;
  const int boundary = (lastIndex >= contextLen) ? contextLen : 0;

  float scoreAtBoundary = 0.0f;

  int position = lastIndex;
  while (position >= 0) {
    const Word &word = phrase.GetWord(position);
    UTIL_THROW_IF(
      (word.IsNonTerminal()),
      util::Exception,
      "BackwardLanguageModel does not currently support rules that include non-terminals "
    );

    const lm::WordIndex index = TranslateID(word);
    scorer.Terminal(index);
    if (index == 0) {
      ++oovCount;
    }

    if (position == boundary) {
      scoreAtBoundary = scorer.Finish();
    }
    --position;
  }

  fullScore = scorer.Finish();

  ngramScore = TransformLMScore(fullScore - scoreAtBoundary);
  fullScore = TransformLMScore(fullScore);
}
/**
 * Score the n-gram in contextFactor with DALM.
 *
 * @param contextFactor words of the n-gram; they are pushed into the DALM
 *                      array in vector order.
 * @param finalState    if non-NULL, receives the value of m_lm->get_state()
 *                      for the assembled n-gram.
 * @return score (after TransformLMScore) and whether the last pushed word is
 *         the vocabulary's unknown id.  For an empty context `unknown` is
 *         false.
 */
LMResult LanguageModelDALM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;

  // initialize DALM array
  DALM::VocabId ngram[m_nGramOrder];
  for (size_t i = 0; i < m_nGramOrder; i++) {
    ngram[i] = wid_start;
  }

  // Initialised so that an empty context never reads an indeterminate value.
  DALM::VocabId wid = wid_start;
  for (size_t i = 0; i < contextFactor.size(); ++i) {
    const Word &word = *contextFactor[i];
    wid = GetVocabId(word.GetFactor(m_factorType));
    push(ngram, m_nGramOrder, wid);
  }

  // last word is unk?
  ret.unknown = !contextFactor.empty() && (wid == m_vocab->unk());

  // calc score. Doesn't handle unk yet
  float score = m_lm->query(ngram, m_nGramOrder);
  score = TransformLMScore(score);
  ret.score = score;

  // finalState is optional in the sibling GetValue() implementations; guard
  // it here as well instead of dereferencing a null pointer.
  if (finalState) {
    (*finalState) = (void *)m_lm->get_state(ngram, m_nGramOrder);
  }

  return ret;
}
float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState) const { FactorType factorType = GetFactorType(); // set up context size_t count = contextFactor.size(); if (count < 0) { cerr << "ERROR count < 0\n"; exit(100); }; // set up context int codes[MAX_NGRAM_SIZE]; size_t idx=0; //fill the farthest positions with at most ONE sentenceEnd symbol and at most ONE sentenceEnd symbol, if "empty" positions are available //so that the vector looks like = "</s> <s> context_word context_word" for a two-word context and a LM of order 5 if (count < (size_t) (m_lmtb_size-1)) codes[idx++] = m_lmtb_sentenceEnd; if (count < (size_t) m_lmtb_size) codes[idx++] = m_lmtb_sentenceStart; for (size_t i = 0 ; i < count ; i++) codes[idx++] = GetLmID((*contextFactor[i])[factorType]); float prob; char* msp = NULL; unsigned int ilen; prob = m_lmtb->clprob(codes,idx,NULL,NULL,&msp,&ilen); if (finalState) *finalState=(State *) msp; return TransformLMScore(prob); }
/**
 * Query the IRSTLM table for the n-gram held in contextFactor.
 *
 * @param contextFactor words of the n-gram, copied into the query in vector
 *                      order after optional boundary padding.
 * @param finalState    if non-NULL, receives the pointer that clprob()
 *                      reports through its fifth argument.
 * @return score (after TransformLMScore) and whether the last code placed in
 *         the query equals m_unknownId.
 *
 * Terminates the process with exit(100) if the padded context does not fit
 * into the MAX_NGRAM_SIZE query buffer.
 */
LMResult LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  // set up context
  const size_t count = contextFactor.size();

  int codes[MAX_NGRAM_SIZE];
  size_t idx = 0;

  // Fill the farthest positions with at most ONE sentenceEnd symbol and at
  // most ONE sentenceStart symbol, if "empty" positions are available, so
  // that the vector looks like "</s> <s> context_word context_word" for a
  // two-word context and a LM of order 5.
  if (count < (size_t) (m_lmtb_size-1)) codes[idx++] = m_lmtb_sentenceEnd;
  if (count < (size_t) m_lmtb_size) codes[idx++] = m_lmtb_sentenceStart;

  // Replaces the former `count < 0` test, which could never fire on an
  // unsigned value, with the check that actually protects the buffer.
  if (count > (size_t) MAX_NGRAM_SIZE - idx) {
    cerr << "ERROR context of " << count << " words exceeds MAX_NGRAM_SIZE\n";
    exit(100);
  }

  for (size_t i = 0 ; i < count ; i++) {
    codes[idx] = GetLmID(*contextFactor[i]);
    ++idx;
  }

  LMResult result;
  result.unknown = (codes[idx - 1] == m_unknownId);

  char* msp = NULL;
  result.score = m_lmtb->clprob(codes,idx,NULL,NULL,&msp);

  if (finalState) *finalState=(State *) msp;

  result.score = TransformLMScore(result.score);
  return result;
}
float LanguageModelIRST::GetValue(const vector<const Word*> &contextFactor, State* finalState, unsigned int* len) const { unsigned int dummy; if (!len) { len = &dummy; } FactorType factorType = GetFactorType(); // set up context size_t count = contextFactor.size(); m_lmtb_ng->size=0; if (count< (size_t)(m_lmtb_size-1)) m_lmtb_ng->pushc(m_lmtb_sentenceEnd); if (count< (size_t)m_lmtb_size) m_lmtb_ng->pushc(m_lmtb_sentenceStart); for (size_t i = 0 ; i < count ; i++) { //int lmId = GetLmID((*contextFactor[i])[factorType]); #ifdef DEBUG cout << "i=" << i << " -> " << (*contextFactor[i])[factorType]->GetString() << "\n"; #endif int lmId = GetLmID((*contextFactor[i])[factorType]->GetString()); // cerr << (*contextFactor[i])[factorType]->GetString() << " = " << lmId; m_lmtb_ng->pushc(lmId); } if (finalState){ *finalState=(State *)m_lmtb->cmaxsuffptr(*m_lmtb_ng); // back off stats not currently available *len = 0; } float prob = m_lmtb->clprob(*m_lmtb_ng); return TransformLMScore(prob); }
/**
 * Look up one word in the SRILM model.
 *
 * @param wordId  id of the word being predicted.
 * @param context context array handed unchanged to wordProb().
 * @return wordProb() after TransformLMScore() and FloorScore(), plus a flag
 *         telling whether wordId equals m_unknownId.
 */
LMResult LanguageModelSRI::GetValue(VocabIndex wordId, VocabIndex *context) const
{
  LMResult result;
  result.score = FloorScore(
                   TransformLMScore(
                     m_srilmModel->wordProb(wordId, context)));
  result.unknown = (m_unknownId == wordId);
  return result;
}
/**
 * Score the last word of contextFactor with the factored (parallel back-off)
 * SRILM model.
 *
 * For every word a row of the id matrix is filled: column 0 holds a tag id
 * (m_wtbid / m_wteid / m_wtid, chosen by comparing column 1 against the
 * sentence-start and sentence-end ids), and column j + 1 holds the LM id of
 * factor m_factorTypesOrdered[j], or 0 if the word has no such factor.
 *
 * @param contextFactor words of the n-gram.
 * @return score after TransformLMScore() and FloorScore(), and the unknown
 *         flag.  An empty context yields score 0 and unknown == false
 *         without querying the model.
 */
LMResult LanguageModelParallelBackoff::GetValueForgotState(const std::vector<const Word*> &contextFactor, FFState & /*outState */) const
{
  LMResult ret;

  // With no words, `size() - 1` below would wrap around; answer directly,
  // as LanguageModelRemote::GetValue does for an empty context.
  const size_t numWords = contextFactor.size();
  if (numWords == 0) {
    ret.score = 0.0;
    ret.unknown = false;
    return ret;
  }

  static WidMatrix widMatrix;

  const size_t numFactors = m_factorTypesOrdered.size();
  for (size_t i = 0; i < numWords; i++) {
    ::memset(widMatrix[i], 0, (numFactors + 1) * sizeof(VocabIndex));

    const Word &word = *contextFactor[i];
    for (size_t j = 0; j < numFactors; j++) {
      const Factor *factor = word[ m_factorTypesOrdered[j] ];

      if (factor == NULL)
        widMatrix[i][j + 1] = 0;
      else
        widMatrix[i][j + 1] = GetLmID(factor, j);
    }

    // Column 0 carries the tag of the word.
    if (widMatrix[i][1] == GetLmID(m_sentenceStartArray[0], 0) ) {
      widMatrix[i][0] = m_wtbid;
    } else if (widMatrix[i][1] == GetLmID(m_sentenceEndArray[0], 0 )) {
      widMatrix[i][0] = m_wteid;
    } else {
      widMatrix[i][0] = m_wtid;
    }
  }

  ret.score = m_srilmModel->wordProb( widMatrix, numWords - 1, numWords );
  ret.score = FloorScore(TransformLMScore(ret.score));

  ret.unknown = (widMatrix[numWords - 1][0] == m_unknownId);
  return ret;
}
/**
 * Read the language model file at m_path into the in-memory trie m_root.
 *
 * Each useful line has two or three tab-separated fields:
 *   score <TAB> n-gram words <TAB> optional back-off.
 * Lines with fewer than two fields are skipped.  The "<unk>" entry only sets
 * m_oov.  Every other n-gram is inserted under its words in REVERSE order
 * together with its (transformed) probability and back-off.
 *
 * A progress counter is written to stderr every 100000 lines.
 */
void LanguageModel::Load(System &system)
{
  FactorCollection &vocab = system.GetVocab();

  m_bos = vocab.AddFactor(BOS_, system, false);
  m_eos = vocab.AddFactor(EOS_, system, false);

  InputFileStream infile(m_path);
  string line;
  for (size_t lineNum = 1; getline(infile, line); ++lineNum) {
    if (lineNum % 100000 == 0) {
      cerr << lineNum << " ";
    }

    vector<string> substrings = Tokenize(line, "\t");
    if (substrings.size() < 2) {
      continue;
    }
    assert(substrings.size() == 2 || substrings.size() == 3);

    SCORE prob = TransformLMScore(Scan<SCORE>(substrings[0]));
    if (substrings[1] == "<unk>") {
      m_oov = prob;
      continue;
    }

    SCORE backoff = 0.f;
    if (substrings.size() == 3) {
      backoff = TransformLMScore(Scan<SCORE>(substrings[2]));
    }

    // ngram: factors are registered in reading order but stored back to front
    vector<string> words = Tokenize(substrings[1], " ");
    const size_t numWords = words.size();
    vector<const Factor*> reversedKey(numWords);
    for (size_t i = 0; i < numWords; ++i) {
      reversedKey[numWords - i - 1] = vocab.AddFactor(words[i], system, false);
    }

    m_root.insert(reversedKey, LMScores(prob, backoff));
  }
}
/**
 * Score the target side of a chart (SCFG) hypothesis.
 *
 * Terminals are fed to a KenLM RuleScore; at each non-terminal the chart
 * state stored in the matching predecessor hypothesis is spliced in.  The
 * resulting chart state is written into `state` and the transformed score is
 * added to `scores`.
 *
 * @param mgr       manager giving access to the System.
 * @param hypo      hypothesis whose target phrase is scored.
 * @param featureID index used to fetch this feature's state from
 *                  predecessor hypotheses.
 * @param scores    score collection that receives the LM score.
 * @param state     output; must be a LanguageModelChartStateKenLM.
 */
void KENLM<Model>::EvaluateWhenApplied(const SCFG::Manager &mgr,
                                       const SCFG::Hypothesis &hypo, int featureID, Scores &scores,
                                       FFState &state) const
{
  LanguageModelChartStateKenLM &outState = static_cast<LanguageModelChartStateKenLM&>(state);
  lm::ngram::RuleScore<Model> ruleScore(*m_ngram, outState.GetChartState());

  const SCFG::TargetPhraseImpl &target = hypo.GetTargetPhrase();
  const AlignmentInfo::NonTermIndexMap &nonTermIndexMap =
    target.GetAlignNonTerm().GetNonTermIndexMap();

  const size_t size = target.GetSize();
  size_t phrasePos = 0;

  // Special cases for first word.
  if (size) {
    const SCFG::Word &first = target[0];
    if (first[m_factorType] == m_bos) {
      // Begin of sentence
      ruleScore.BeginSentence();
      ++phrasePos;
    } else if (first.isNonTerminal) {
      // Non-terminal is first so we can copy instead of rescoring.
      const SCFG::Hypothesis *child = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &childState =
        static_cast<const LanguageModelChartStateKenLM*>(child->GetState(featureID))->GetChartState();
      ruleScore.BeginNonTerminal(childState);
      ++phrasePos;
    }
  }

  while (phrasePos < size) {
    const SCFG::Word &current = target[phrasePos];
    if (!current.isNonTerminal) {
      ruleScore.Terminal(TranslateID(current));
    } else {
      const SCFG::Hypothesis *child = hypo.GetPrevHypo(nonTermIndexMap[phrasePos]);
      const lm::ngram::ChartState &childState =
        static_cast<const LanguageModelChartStateKenLM*>(child->GetState(featureID))->GetChartState();
      ruleScore.NonTerminal(childState);
    }
    ++phrasePos;
  }

  float score = TransformLMScore(ruleScore.Finish());

  // take out score from loading. This needs reworking
  //score -= target.GetScores().GetScores(*this)[0];

  // The OOV feature is switched off here, so only the plain score is added.
  bool OOVFeatureEnabled = false;
  if (!OOVFeatureEnabled) {
    scores.PlusEquals(mgr.system, *this, score);
  } else {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(mgr.system, *this, scoresVec);
  }
}
void LanguageModelIRST::CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const { fullScore = 0; ngramScore = 0; oovCount = 0; if ( !phrase.GetSize() ) return; int _min = min(m_lmtb_size - 1, (int) phrase.GetSize()); int codes[m_lmtb_size]; int idx = 0; codes[idx] = m_lmtb_sentenceStart; ++idx; int position = 0; char* msp = NULL; float before_boundary = 0.0; for (; position < _min; ++position) { codes[idx] = GetLmID(phrase.GetWord(position)); if (codes[idx] == m_unknownId) ++oovCount; before_boundary += m_lmtb->clprob(codes,idx+1,NULL,NULL,&msp); ++idx; } ngramScore = 0.0; int end_loop = (int) phrase.GetSize(); for (; position < end_loop; ++position) { for (idx = 1; idx < m_lmtb_size; ++idx) { codes[idx-1] = codes[idx]; } codes[idx-1] = GetLmID(phrase.GetWord(position)); if (codes[idx-1] == m_unknownId) ++oovCount; ngramScore += m_lmtb->clprob(codes,idx,NULL,NULL,&msp); } before_boundary = TransformLMScore(before_boundary); ngramScore = TransformLMScore(ngramScore); fullScore = ngramScore + before_boundary; }
/**
 * Score a single word against a DALM state.
 *
 * @param wid        vocabulary id of the word to score.
 * @param finalState state handed to m_lm->query() by reference; it is
 *                   dereferenced unconditionally and must not be NULL.
 * @return the query result after TransformLMScore(), and whether wid equals
 *         the vocabulary's unknown id.
 */
LMResult LanguageModelDALM::GetValue(DALM::VocabId wid, DALM::State* finalState) const
{
  LMResult result;

  // last word is unk?
  result.unknown = (m_vocab->unk() == wid);

  // calc score.
  const float rawScore = m_lm->query(wid, *finalState);
  result.score = TransformLMScore(rawScore);

  return result;
}
/**
 * Score the n-gram in contextFactor with RandLM.
 *
 * @param contextFactor words of the n-gram; each is mapped with GetLmID()
 *                      into a fixed buffer of MAX_NGRAM_SIZE ids.
 * @param finalState    forwarded unchanged to m_lm->getProb().
 * @return getProb() after TransformLMScore() and FloorScore(), and whether
 *         the last id equals m_oov_id (false for an empty context).
 */
LMResult LanguageModelRandLM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  FactorType factorType = GetFactorType();

  // set up context
  randlm::WordID ngram[MAX_NGRAM_SIZE];
  const int count = contextFactor.size();
  int pos = 0;
  while (pos < count) {
    ngram[pos] = GetLmID((*contextFactor[pos])[factorType]);
    ++pos;
  }

  int found = 0;
  LMResult result;
  result.score = FloorScore(
                   TransformLMScore(
                     m_lm->getProb(&ngram[0], count, &found, finalState)));
  result.unknown = (count != 0) && (ngram[count - 1] == m_oov_id);
  return result;
}
/**
 * Score the n-gram in contextFactor with DALM and report a hash of the
 * trailing words as the LM state.
 *
 * @param contextFactor words of the n-gram; they are pushed into the DALM
 *                      array in vector order.
 * @param finalState    if non-NULL, receives a hash over the factors of the
 *                      words from startPos onwards (startPos is 0 when the
 *                      context is shorter than the model order, else 1).
 * @return score (after TransformLMScore) and whether the last pushed word is
 *         DALM_UNK_WORD.  For an empty context `unknown` is false.
 */
LMResult LanguageModelDALM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;

  // initialize DALM array
  DALM::VocabId ngram[m_nGramOrder];
  for (size_t i = 0; i < m_nGramOrder; i++) {
    ngram[i] = wid_start;
  }

  // Initialised so that an empty context never reads an indeterminate value.
  DALM::VocabId wid = wid_start;
  for (size_t i = 0; i < contextFactor.size(); ++i) {
    const Word &word = *contextFactor[i];
    wid = GetVocabId(word.GetFactor(m_factorType));
    push(ngram, m_nGramOrder, wid);
  }

  // last word is unk?
  ret.unknown = !contextFactor.empty() && (wid == DALM_UNK_WORD);

  // calc score. Doesn't handle unk yet
  float score = m_lm->query(ngram, m_nGramOrder);
  score = TransformLMScore(score);
  ret.score = score;

  // finalState is optional in the sibling GetValue() implementations; only
  // build the state hash when somebody wants it.
  if (finalState) {
    // hash of n-1 words to use as state
    size_t startPos = (contextFactor.size() < m_nGramOrder) ? 0 : 1;

    size_t hash = 0;
    for (size_t i = startPos; i < contextFactor.size(); ++i) {
      const Word &word = *contextFactor[i];
      const Factor *factor = word.GetFactor(m_factorType);
      boost::hash_combine(hash, factor);
    }

    (*finalState) = (State*) hash;
  }

  return ret;
}
/**
 * Score the n-gram in contextFactor with DALM and report the factor of the
 * last word as the LM state.
 *
 * @param contextFactor words of the n-gram; they are pushed into the DALM
 *                      array in vector order.
 * @param finalState    if non-NULL, receives the factor pointer of the last
 *                      word, or NULL when the context is empty.
 * @return score (after TransformLMScore) and whether the last pushed word is
 *         DALM_UNK_WORD.  For an empty context `unknown` is false.
 */
LMResult LanguageModelDALM::GetValue(const vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;

  // initialize DALM array
  DALM::VocabId ngram[m_nGramOrder];
  for (size_t i = 0; i < m_nGramOrder; i++) {
    ngram[i] = wid_start;
  }

  // Initialised so that an empty context never reads an indeterminate value.
  DALM::VocabId wid = wid_start;
  for (size_t i = 0; i < contextFactor.size(); ++i) {
    const Word &word = *contextFactor[i];
    wid = GetVocabId(word.GetFactor(m_factorType));
    push(ngram, m_nGramOrder, wid);
  }

  // last word
  ret.unknown = !contextFactor.empty() && (wid == DALM_UNK_WORD);

  float prob = m_lm->query(ngram, m_nGramOrder);
  ret.score = TransformLMScore(prob);

  // finalState is optional in the sibling GetValue() implementations; guard
  // it here as well instead of dereferencing a null pointer.
  if (finalState) {
    // use last word as state info
    const Factor *factor = NULL;
    if (contextFactor.size()) {
      factor = contextFactor.back()->GetFactor(m_factorType);
    }
    (*finalState) = (State*) factor;
  }

  return ret;
}
/**
 * Load an n-gram model (up to trigrams) from a text file into m_map.
 *
 * Lines that are empty or start with a backslash are ignored, as are lines
 * with fewer than two tab-separated fields or an empty n-gram field.  For
 * every other line the n-gram (second field) is walked from its LAST word to
 * its FIRST, creating tree nodes as needed; the node reached gets the
 * transformed score (first field) and the transformed back-off (third field,
 * 0 if absent).
 *
 * After reading, m_lmIdLookup maps the factor id of each n-gram's first word
 * to that word's root node (NULL for ids never seen).
 *
 * @param filePath   path of the model file.
 * @param factorType factor the model is defined over.
 * @param weight     feature weight, stored in m_weight.
 * @param nGramOrder model order; values above 3 abort the process.
 * @return always true.
 */
bool LanguageModelInternal::Load(const std::string &filePath
                                 , FactorType factorType
                                 , float weight
                                 , size_t nGramOrder)
{
  assert(nGramOrder <= 3);
  if (nGramOrder > 3) {
    UserMessage::Add("Can only do up to trigram. Aborting");
    abort();
  }

  VERBOSE(1, "Loading Internal LM: " << filePath << endl);

  FactorCollection &factorCollection = FactorCollection::Instance();

  m_filePath = filePath;
  m_factorType = factorType;
  m_weight = weight;
  m_nGramOrder = nGramOrder;

  // make sure start & end tags in factor collection
  m_sentenceStart = factorCollection.AddFactor(Output, m_factorType, BOS_);
  m_sentenceStartArray[m_factorType] = m_sentenceStart;

  m_sentenceEnd = factorCollection.AddFactor(Output, m_factorType, EOS_);
  m_sentenceEndArray[m_factorType] = m_sentenceEnd;

  // read in file
  VERBOSE(1, filePath << endl);

  InputFileStream inFile(filePath);

  // to create lookup vector later on
  size_t maxFactorId = 0;
  map<size_t, const NGramNode*> lmIdMap;

  string line;
  // Loop on the stream state itself: testing eof() alone never terminates
  // when the file cannot be opened and drops a final line that has no
  // trailing newline.
  while (getline(inFile, line)) {
    if (line.empty() || line[0] == '\\') {
      continue;
    }

    vector<string> tokens = Tokenize(line, "\t");
    if (tokens.size() < 2) {
      continue;
    }

    // split unigram/bigram trigrams
    vector<string> factorStr = Tokenize(tokens[1], " ");
    if (factorStr.empty()) {
      // Nothing to insert; skipping avoids using `factor` and `nGram`
      // before they have been assigned.
      continue;
    }

    // create / traverse down tree
    NGramCollection *ngramColl = &m_map;
    NGramNode *nGram = NULL;
    const Factor *factor = NULL;
    for (int currFactor = (int) factorStr.size() - 1 ; currFactor >= 0 ; currFactor--) {
      factor = factorCollection.AddFactor(Output, m_factorType, factorStr[currFactor]);
      nGram = ngramColl->GetOrCreateNGram(factor);

      ngramColl = nGram->GetNGramColl();
    }

    // `factor` is now the first word of the n-gram.
    NGramNode *rootNGram = m_map.GetNGram(factor);
    nGram->SetRootNGram(rootNGram);

    // create vector of factors used in this LM
    size_t factorId = factor->GetId();
    maxFactorId = (factorId > maxFactorId) ? factorId : maxFactorId;
    lmIdMap[factorId] = rootNGram;
    //factorCollection.SetFactorLmId(factor, rootNGram);

    float score = TransformLMScore(Scan<float>(tokens[0]));
    nGram->SetScore( score );
    if (tokens.size() == 3) {
      float logBackOff = TransformLMScore(Scan<float>(tokens[2]));
      nGram->SetLogBackOff( logBackOff );
    } else {
      nGram->SetLogBackOff( 0 );
    }
  }

  // add to lookup vector in object
  m_lmIdLookup.resize(maxFactorId+1);
  fill(m_lmIdLookup.begin(), m_lmIdLookup.end(), static_cast<const NGramNode*>(NULL));

  map<size_t, const NGramNode*>::iterator iterMap;
  for (iterMap = lmIdMap.begin() ; iterMap != lmIdMap.end() ; ++iterMap) {
    m_lmIdLookup[iterMap->first] = iterMap->second;
  }

  return true;
}
/**
 * Score the target words added by a phrase-based hypothesis and produce the
 * LM state that follows them.
 *
 * @param hypo hypothesis whose newly added target words are scored.
 * @param ps   previous LM state; it is only used when the hypothesis adds no
 *             target words, in which case a state built from it is returned.
 * @param out  score collection that receives the transformed LM score.
 * @return a newly allocated IRSTLMState; the caller owns it.
 *
 * Only the first m_lmtb_size - 1 new words are scored word by word.  When
 * the source is fully covered the sentence-end symbol is scored as well.
 * Otherwise, if the phrase is longer than that, the state is recomputed from
 * the phrase's last m_lmtb_size - 1 words.
 */
FFState* LanguageModelIRST::EvaluateWhenApplied(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const
{
  if (!hypo.GetCurrTargetLength()) {
    return new IRSTLMState(ps);
  }

  //[begin, end) in STL-like fashion.
  const int begin = (const int) hypo.GetCurrTargetWordsRange().GetStartPos();
  const int end = (const int) hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  const int adjust_end = (const int) std::min(end, begin + m_lmtb_size - 1);

  // set up context
  // fill the farthest positions with sentenceStart symbols, if "empty"
  // positions are available, so that the vector looks like
  // "<s> <s> context_word context_word" for a two-word context and a LM of
  // order 5
  int codes[m_lmtb_size];
  int idx = m_lmtb_size - 1;
  int position = (const int) begin;
  // Stop at whichever runs out first, history or buffer: without the idx
  // bound a start position >= the LM order wrote before codes[0].
  while (position >= 0 && idx >= 0) {
    codes[idx] = GetLmID(hypo.GetWord(position));
    --idx;
    --position;
  }
  while (idx >= 0) {
    codes[idx] = m_lmtb_sentenceStart;
    --idx;
  }

  char* msp = NULL;
  float score = m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);

  // Slide the window over the remaining words that still need scoring.
  position = (const int) begin+1;
  while (position < adjust_end) {
    for (idx=1; idx<m_lmtb_size; idx++) {
      codes[idx-1] = codes[idx];
    }
    codes[idx-1] = GetLmID(hypo.GetWord(position));
    score += m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);
    ++position;
  }

  // adding probability of having sentenceEnd symbol, after this phrase;
  // this could happen only when all source words are covered
  if (hypo.IsSourceCompleted()) {
    idx = m_lmtb_size - 1;
    codes[idx] = m_lmtb_sentenceEnd;
    --idx;
    position = (const int) end - 1;
    while (position >= 0 && idx >= 0) {
      codes[idx] = GetLmID(hypo.GetWord(position));
      --idx;
      --position;
    }
    while (idx >= 0) {
      codes[idx] = m_lmtb_sentenceStart;
      --idx;
    }
    score += m_lmtb->clprob(codes,m_lmtb_size,NULL,NULL,&msp);
  } else {
    // need to set the LM state
    if (adjust_end < end) {
      // the LMstate of this target phrase refers to the last m_lmtb_size-1
      // words.  Step back one word per slot; previously `position` was never
      // decremented, so every slot received the same (last) word.
      position = (const int) end - 1;
      for (idx=m_lmtb_size-1; idx>0; --idx) {
        codes[idx] = GetLmID(hypo.GetWord(position));
        --position;
      }
      codes[idx] = m_lmtb_sentenceStart;
      msp = (char *) m_lmtb->cmaxsuffptr(codes,m_lmtb_size);
    }
  }

  score = TransformLMScore(score);
  out->PlusEquals(this, score);

  return new IRSTLMState(msp);
}
/**
 * Score an n-gram by asking a remote LM server, caching answers in a trie.
 *
 * The cache is walked with the context words first and the predicted (last)
 * word last; a NULL factor stands for BOS in the context and for EOS as the
 * predicted word.  A node whose prob is non-zero is answered from the cache.
 * Otherwise the node gets a fresh back-off state id, a "prob ..." request is
 * written to the socket (predicted word first, then up to m_nGramOrder - 1
 * context words from nearest to farthest), and up to 6 bytes of reply are
 * read; the leading bytes are interpreted as a float.
 *
 * @param contextFactor words of the n-gram.
 * @param finalState    if non-NULL, receives the node's back-off state
 *                      (NULL for an empty context).
 * @return the score; `unknown` is always false.  An empty context yields 0.
 *
 * Exits the process after more than 5 failed reads.
 */
LMResult LanguageModelRemote::GetValue(const std::vector<const Word*> &contextFactor, State* finalState) const
{
  LMResult ret;
  ret.unknown = false;

  size_t count = contextFactor.size();
  if (count == 0) {
    if (finalState) *finalState = NULL;
    ret.score = 0.0;
    return ret;
  }
  //std::cerr << "contextFactor.size() = " << count << "\n";

  size_t max = m_nGramOrder;
  const FactorType factor = GetFactorType();
  if (max > count) max = count;

  // Walk / extend the cache trie.
  Cache* cur = &m_cache;
  int pc = static_cast<int>(count) - 1;
  for (int i = 0; i < pc; ++i) {
    const Factor* f = contextFactor[i]->GetFactor(factor);
    cur = &cur->tree[f ? f : BOS];
  }
  const Factor* event_word = contextFactor[pc]->GetFactor(factor);
  cur = &cur->tree[event_word ? event_word : EOS];

  if (cur->prob) {
    if (finalState) *finalState = cur->boState;
    ret.score = cur->prob;
    return ret;
  }

  cur->boState = *reinterpret_cast<const State*>(&m_curId);
  ++m_curId;

  // Build the request line.
  std::ostringstream os;
  os << "prob ";
  if (event_word == NULL) {
    os << "</s>";
  } else {
    os << event_word->GetString();
  }
  for (size_t i=1; i<max; i++) {
    const Factor* f = contextFactor[count-1-i]->GetFactor(factor);
    if (f == NULL) {
      os << " <s>";
    } else {
      os << ' ' << f->GetString();
    }
  }
  os << '\n';
  std::string out = os.str();
  write(sock, out.c_str(), out.size());

  // Zero-filled so that a short or missing reply is never read as
  // indeterminate memory.
  char res[6] = {0};
  int r = read(sock, res, 6);
  int errors = 0;
  int cnt = 0;
  while (1) {
    if (r < 0) {
      errors++;
      sleep(1);
      //std::cerr << "Error: read()\n";
      if (errors > 5) exit(1);
      // Actually retry; previously `r` was never refreshed, so the loop
      // only slept until the error budget ran out.
      r = read(sock, &res[cnt], 6-cnt);
    } else if (r==0 || res[cnt] == '\n') {
      break;
    } else {
      cnt += r;
      if (cnt==6) break;
      // Keep the byte count of THIS read; reusing the first read's count
      // let cnt pass 6 and produced a negative length.
      r = read(sock, &res[cnt], 6-cnt);
    }
  }

  cur->prob = FloorScore(TransformLMScore(*reinterpret_cast<float*>(res)));
  if (finalState) {
    *finalState = cur->boState;
  }
  ret.score = cur->prob;
  return ret;
}
void testCalcScore() { double p_the = -1.383059; double p_licenses = -2.360783; double p_for = -1.661813; double p_most = -2.360783; // double p_software = -1.62042; double p_the_licenses = -0.9625873; double p_licenses_for = -1.661557; double p_for_most = -0.4526253; // double p_most_software = -1.70295; double p_the_licenses_for = p_the_licenses + p_licenses_for; // double p_licenses_for_most = p_licenses_for + p_for_most; // the { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 1 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount == 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01); } // the licenses { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the licenses", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 2 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount == 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01); } // the licenses for { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the licenses for", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 3 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount 
== 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01); } // the licenses for most { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the licenses for most", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 4 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount == 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01); } }
/**
 * Score the target words added by a phrase-based hypothesis and compute the
 * KenLM state that follows them.
 *
 * @param mgr       manager giving access to the System.
 * @param hypo      hypothesis whose newly added target words are scored.
 * @param prevState LM state before the new words (a KenLMState).
 * @param scores    score collection that receives the transformed score.
 * @param state     output LM state (a KenLMState).
 *
 * Only the first Order() - 1 new words are scored one by one.  When the
 * bitmap is complete the end-of-sentence token is scored as well.  Otherwise,
 * if the phrase is longer than that, the output state is rebuilt from the
 * ids returned by LastIDs().  An empty target phrase copies the incoming
 * state and adds no score.
 */
void KENLM<Model>::EvaluateWhenApplied(const ManagerBase &mgr,
                                       const Hypothesis &hypo, const FFState &prevState, Scores &scores,
                                       FFState &state) const
{
  KenLMState &outState = static_cast<KenLMState&>(state);
  const System &system = mgr.system;
  const lm::ngram::State &in_state = static_cast<const KenLMState&>(prevState).state;

  if (hypo.GetTargetPhrase().GetSize() == 0) {
    outState.state = in_state;
    return;
  }

  //[begin, end) in STL-like fashion.
  const std::size_t begin = hypo.GetCurrTargetWordsRange().GetStartPos();
  const std::size_t end = hypo.GetCurrTargetWordsRange().GetEndPos() + 1;
  const std::size_t adjust_end = std::min(end, begin + m_ngram->Order() - 1);

  // Two state buffers are used alternately: `current` always points at the
  // state produced by the most recent Score() call.
  typename Model::State scratch;
  typename Model::State *current = &outState.state;
  typename Model::State *spare = &scratch;

  std::size_t position = begin;
  float score = m_ngram->Score(in_state, TranslateID(hypo.GetWord(position)), *current);
  ++position;

  while (position < adjust_end) {
    score += m_ngram->Score(*current, TranslateID(hypo.GetWord(position)), *spare);
    std::swap(current, spare);
    ++position;
  }

  if (hypo.GetBitmap().IsComplete()) {
    // Score end of sentence.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    score += m_ngram->FullScoreForgotState(&indices.front(), last,
                                           m_ngram->GetVocabulary().EndSentence(), outState.state).prob;
  } else if (adjust_end < end) {
    // Get state after adding a long phrase.
    std::vector<lm::WordIndex> indices(m_ngram->Order() - 1);
    const lm::WordIndex *last = LastIDs(hypo, &indices.front());
    m_ngram->GetState(&indices.front(), last, outState.state);
  } else if (current != &outState.state) {
    // Short enough phrase that we can just reuse the state.
    outState.state = *current;
  }

  score = TransformLMScore(score);

  // The OOV feature is switched off here, so only the plain score is added.
  bool OOVFeatureEnabled = false;
  if (!OOVFeatureEnabled) {
    scores.PlusEquals(system, *this, score);
  } else {
    std::vector<float> scoresVec(2);
    scoresVec[0] = score;
    scoresVec[1] = 0.0;
    scores.PlusEquals(system, *this, scoresVec);
  }
}