float GlobalLexicalModel::ScorePhrase( const TargetPhrase& targetPhrase ) const { const Sentence& input = *(m_local->input); float score = 0; for(size_t targetIndex = 0; targetIndex < targetPhrase.GetSize(); targetIndex++ ) { float sum = 0; const Word& targetWord = targetPhrase.GetWord( targetIndex ); VERBOSE(2,"glm " << targetWord << ": "); const DoubleHash::const_iterator targetWordHash = m_hash.find( &targetWord ); if( targetWordHash != m_hash.end() ) { SingleHash::const_iterator inputWordHash = targetWordHash->second.find( m_bias ); if( inputWordHash != targetWordHash->second.end() ) { VERBOSE(2,"*BIAS* " << inputWordHash->second); sum += inputWordHash->second; } set< const Word*, WordComparer > alreadyScored; // do not score a word twice for(size_t inputIndex = 0; inputIndex < input.GetSize(); inputIndex++ ) { const Word& inputWord = input.GetWord( inputIndex ); if ( alreadyScored.find( &inputWord ) == alreadyScored.end() ) { SingleHash::const_iterator inputWordHash = targetWordHash->second.find( &inputWord ); if( inputWordHash != targetWordHash->second.end() ) { VERBOSE(2," " << inputWord << " " << inputWordHash->second); sum += inputWordHash->second; } alreadyScored.insert( &inputWord ); } } } // Hal Daume says: 1/( 1 + exp [ - sum_i w_i * f_i ] ) VERBOSE(2," p=" << FloorScore( log(1/(1+exp(-sum))) ) << endl); score += FloorScore( log(1/(1+exp(-sum))) ); } return score; }
void CacheBasedLanguageModel::Evaluate_Whole_String(const TargetPhrase& tp, ScoreComponentCollection* out) const { //VERBOSE(1,"CacheBasedLanguageModel::Evaluate_Whole_String" << std::endl); //consider all words in the TargetPhrase as one n-gram // and compute the decaying_score for all words // and return their sum decaying_cache_t::const_iterator it; float score = 0.0; std::string w = ""; size_t endpos = tp.GetSize(); for (size_t pos = 0 ; pos < endpos ; ++pos) { if (pos > 0){ w += " "; } w += tp.GetWord(pos).GetFactor(0)->GetString(); } it = m_cache.find(w); if (it != m_cache.end()) //found! { score = ((*it).second).second; VERBOSE(3,"cblm::Evaluate: found w:|" << w << "| score:|" << score << "|" << std::endl); } else{ score = precomputedScores[maxAge]; // one score per phrase table VERBOSE(3,"cblm::Evaluate: not found w:|" << w << "| score:|" << score << "|" << std::endl); } VERBOSE(3,"cblm::Evaluate: phrase:|" << tp << "| score:|" << score << "|" << std::endl); out->PlusEquals(this, score); }
void CacheBasedLanguageModel::Evaluate_All_Substrings(const TargetPhrase& tp, ScoreComponentCollection* out) const { //VERBOSE(1,"CacheBasedLanguageModel::Evaluate_All_Substrings" << std::endl); //loop over all n-grams in the TargetPhrase (no matter of n) // and compute the decaying_score for all words // and return their sum decaying_cache_t::const_iterator it; float score = 0.0; size_t tp_size = tp.GetSize(); for (size_t startpos = 0 ; startpos < tp_size ; ++startpos) { std::string w = ""; for (size_t endpos = startpos; endpos < tp_size ; ++endpos) { if (endpos > startpos){ w += " "; } w += tp.GetWord(endpos).GetFactor(0)->GetString(); it = m_cache.find(w); float tmpsc; if (it != m_cache.end()) //found! { tmpsc = ((*it).second).second; VERBOSE(3,"cblm::Evaluate: found w:|" << w << "| score:|" << tmpsc << "|" << std::endl); } else{ tmpsc = precomputedScores[maxAge]; // one score per phrase table VERBOSE(3,"cblm::Evaluate: not found w:|" << w << "| score:|" << tmpsc << "|" << std::endl); } score += ( tmpsc / ( tp_size + startpos - endpos ) ); VERBOSE(3,"cblm::Evaluate: actual score:|" << score << "|" << std::endl); } } VERBOSE(3,"cblm::Evaluate: phrase:|" << tp << "| score:|" << score << "|" << std::endl); out->PlusEquals(this, score); }
void OpSequenceModel:: Evaluate(const Phrase &source , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &estimatedFutureScore) const { osmHypothesis obj; obj.setState(OSM->NullContextState()); WordsBitmap myBitmap(source.GetSize()); vector <string> mySourcePhrase; vector <string> myTargetPhrase; vector<float> scores(5); vector <int> alignments; int startIndex = 0; int endIndex = source.GetSize(); const AlignmentInfo &align = targetPhrase.GetAlignTerm(); AlignmentInfo::const_iterator iter; for (iter = align.begin(); iter != align.end(); ++iter) { alignments.push_back(iter->first); alignments.push_back(iter->second); } for (int i = 0; i < targetPhrase.GetSize(); i++) { if (targetPhrase.GetWord(i).IsOOV()) myTargetPhrase.push_back("_TRANS_SLF_"); else myTargetPhrase.push_back(targetPhrase.GetWord(i).GetFactor(0)->GetString().as_string()); } for (int i = 0; i < source.GetSize(); i++) { mySourcePhrase.push_back(source.GetWord(i).GetFactor(0)->GetString().as_string()); } obj.setPhrases(mySourcePhrase , myTargetPhrase); obj.constructCepts(alignments,startIndex,endIndex-1,targetPhrase.GetSize()); obj.computeOSMFeature(startIndex,myBitmap); obj.calculateOSMProb(*OSM); obj.populateScores(scores); estimatedFutureScore.PlusEquals(this, scores); }
void Model1Feature::EvaluateWithSourceContext(const InputType &input , const InputPath &inputPath , const TargetPhrase &targetPhrase , const StackVec *stackVec , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection *estimatedFutureScore) const { const Sentence& sentence = static_cast<const Sentence&>(input); float score = 0.0; float norm = TransformScore(1+sentence.GetSize()); for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) { const Word &wordT = targetPhrase.GetWord(posT); if ( !wordT.IsNonTerminal() ) { float thisWordProb = m_model1.GetProbability(m_emptyWord,wordT[0]); // probability conditioned on empty word // cache lookup bool foundInCache = false; { #ifdef WITH_THREADS boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock); #endif boost::unordered_map<const InputType*, boost::unordered_map<const Factor*, float> >::const_iterator sentenceCache = m_cache.find(&input); if (sentenceCache != m_cache.end()) { boost::unordered_map<const Factor*, float>::const_iterator cacheHit = sentenceCache->second.find(wordT[0]); if (cacheHit != sentenceCache->second.end()) { foundInCache = true; score += cacheHit->second; FEATUREVERBOSE(3, "Cached score( " << wordT << " ) = " << cacheHit->second << std::endl); } } } if (!foundInCache) { for (size_t posS=1; posS<sentence.GetSize()-1; ++posS) { // ignore <s> and </s> const Word &wordS = sentence.GetWord(posS); float modelProb = m_model1.GetProbability(wordS[0],wordT[0]); FEATUREVERBOSE(4, "p( " << wordT << " | " << wordS << " ) = " << modelProb << std::endl); thisWordProb += modelProb; } float thisWordScore = TransformScore(thisWordProb) - norm; FEATUREVERBOSE(3, "score( " << wordT << " ) = " << thisWordScore << std::endl); { #ifdef WITH_THREADS // need to update cache; write lock boost::unique_lock<boost::shared_mutex> lock(m_accessLock); #endif m_cache[&input][wordT[0]] = thisWordScore; } score += thisWordScore; } } } scoreBreakdown.PlusEquals(this, score); }
void CountNonTerms::Evaluate(const Phrase &sourcePhrase , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &estimatedFutureScore) const { const StaticData &staticData = StaticData::Instance(); vector<float> scores(m_numScoreComponents, 0); size_t indScore = 0; if (m_all) { for (size_t i = 0; i < targetPhrase.GetSize(); ++i) { const Word &word = targetPhrase.GetWord(i); if (word.IsNonTerminal()) { ++scores[indScore]; } } ++indScore; } if (m_targetSyntax) { for (size_t i = 0; i < targetPhrase.GetSize(); ++i) { const Word &word = targetPhrase.GetWord(i); if (word.IsNonTerminal() && word != staticData.GetOutputDefaultNonTerminal()) { ++scores[indScore]; } } ++indScore; } if (m_sourceSyntax) { for (size_t i = 0; i < sourcePhrase.GetSize(); ++i) { const Word &word = sourcePhrase.GetWord(i); if (word.IsNonTerminal() && word != staticData.GetInputDefaultNonTerminal()) { ++scores[indScore]; } } ++indScore; } scoreBreakdown.PlusEquals(this, scores); }
void TargetWordInsertionFeature::ComputeFeatures(const TargetPhrase& targetPhrase, ScoreComponentCollection* accumulator, const AlignmentInfo::CollType &alignment) const { // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize(); if (targetLength == 1 && sourceLength == 1) { const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1); if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) { return; } } // flag aligned words bool aligned[16]; CHECK(targetLength < 16); for(size_t i=0; i<targetLength; i++) { aligned[i] = false; } for (AlignmentInfo::const_iterator alignmentPoint = alignment.begin(); alignmentPoint != alignment.end(); alignmentPoint++) { aligned[ alignmentPoint->second ] = true; } // process unaligned target words for(size_t i=0; i<targetLength; i++) { if (!aligned[i]) { Word w = targetPhrase.GetWord(i); if (!w.IsNonTerminal()) { const string &word = w.GetFactor(m_factorType)->GetString(); if (word != "<s>" && word != "</s>") { if (!m_unrestricted && m_vocab.find( word ) == m_vocab.end()) { accumulator->PlusEquals(this,"OTHER",1); } else { accumulator->PlusEquals(this,word,1); } } } } } }
void RulePairUnlexicalizedSource::EvaluateInIsolation(const Phrase &source , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &estimatedFutureScore) const { const Factor* targetPhraseLHS = targetPhrase.GetTargetLHS()[0]; if ( !m_glueRules && (targetPhraseLHS == m_glueTargetLHS) ) { return; } if ( !m_nonGlueRules && (targetPhraseLHS != m_glueTargetLHS) ) { return; } for (size_t posS=0; posS<source.GetSize(); ++posS) { const Word &wordS = source.GetWord(posS); if ( !wordS.IsNonTerminal() ) { return; } } ostringstream namestr; for (size_t posT=0; posT<targetPhrase.GetSize(); ++posT) { const Word &wordT = targetPhrase.GetWord(posT); const Factor* factorT = wordT[0]; if ( wordT.IsNonTerminal() ) { namestr << "["; } namestr << factorT->GetString(); if ( wordT.IsNonTerminal() ) { namestr << "]"; } namestr << "|"; } namestr << targetPhraseLHS->GetString() << "|"; for (AlignmentInfo::const_iterator it=targetPhrase.GetAlignNonTerm().begin(); it!=targetPhrase.GetAlignNonTerm().end(); ++it) { namestr << "|" << it->first << "-" << it->second; } scoreBreakdown.PlusEquals(this, namestr.str(), 1); if ( targetPhraseLHS != m_glueTargetLHS ) { scoreBreakdown.PlusEquals(this, 1); } }
// Adds a search edge for an out-of-vocabulary phrase.
//
// The phrase may hold at most one word (enforced with CHECK). The edge score
// is the phrase's future score plus the weighted language-model probability
// plus the weighted OOV count returned by search::ScoreRule. A pointer to
// the phrase is stored in the edge's note. The collection list and the words
// range are not used.
template <class Model> void Fill<Model>::AddPhraseOOV(TargetPhrase &phrase, std::list<TargetPhraseCollection*> &, const WordsRange &)
{
  std::vector<lm::WordIndex> words;
  CHECK(phrase.GetSize() <= 1);
  if (phrase.GetSize() != 0) {
    words.push_back(Convert(phrase.GetWord(0)));
  }

  search::PartialEdge oovEdge(edges_.AllocateEdge(0));

  // Appears to be a bug that FutureScore does not already include language model.
  search::ScoreRuleRet lmResult(search::ScoreRule(context_.LanguageModel(), words, oovEdge.Between()));
  oovEdge.SetScore(phrase.GetFutureScore()
                   + lmResult.prob * context_.LMWeight()
                   + static_cast<search::Score>(lmResult.oov) * oov_weight_);

  // remember which phrase this edge stands for
  search::Note phraseNote;
  phraseNote.vp = &phrase;
  oovEdge.SetNote(phraseNote);

  edges_.AddEdge(oovEdge);
}
// Returns a newly allocated copy of this phrase in which every word has been
// merged (Word::Merge) with the word at the same position of inputPhrase.
//
// Returns NULL when IsCompatible(inputPhrase) is false. The result is
// allocated with `new` and not released here, so the caller is responsible
// for deleting it.
// NOTE(review): inputPhrase is indexed up to GetSize() of this phrase;
// presumably IsCompatible guarantees it is long enough - confirm.
TargetPhrase *TargetPhrase::MergeNext(const TargetPhrase &inputPhrase) const
{
  if (! IsCompatible(inputPhrase)) {
    return NULL;
  }

  // ok, merge
  TargetPhrase *clone = new TargetPhrase(*this);
  clone->m_sourcePhrase = m_sourcePhrase;

  const size_t len = GetSize();
  for (size_t currPos = 0 ; currPos < len ; currPos++) {
    const Word &inputWord = inputPhrase.GetWord(currPos);
    Word &cloneWord = clone->GetWord(currPos);
    cloneWord.Merge(inputWord);
  }

  return clone;
}
// Returns the target phrase collection stored in m_labelMap for the sequence
// of target non-terminal labels of `target`, creating an empty collection
// first if there is none yet.
//
// The labels are taken in the iteration order of the non-terminal alignment;
// each one is registered through InsertLabel with its position in that
// order. m_labelTable is resized to the number of alignment points.
TargetPhraseCollection::shared_ptr UTrieNode::
GetOrCreateTargetPhraseCollection(const TargetPhrase &target)
{
  const AlignmentInfo &nonTermAlign = target.GetAlignNonTerm();
  const size_t rank = nonTermAlign.GetSize();

  m_labelTable.resize(rank);

  // key into m_labelMap: one label id per non-terminal
  std::vector<int> labelIds;
  labelIds.reserve(rank);

  int slot = 0;
  for (AlignmentInfo::const_iterator point = nonTermAlign.begin();
       point != nonTermAlign.end(); ++point, ++slot) {
    const size_t targetPos = point->second;
    const Word &label = target.GetWord(targetPos);
    labelIds.push_back(InsertLabel(slot, label));
  }

  TargetPhraseCollection::shared_ptr& entry = m_labelMap[labelIds];
  if (!entry) {
    entry.reset(new TargetPhraseCollection);
  }
  return entry;
}
void TargetWordInsertionFeature::ComputeFeatures(const Phrase &source, const TargetPhrase& targetPhrase, ScoreComponentCollection* accumulator, const AlignmentInfo &alignmentInfo) const { // handle special case: unknown words (they have no word alignment) size_t targetLength = targetPhrase.GetSize(); size_t sourceLength = source.GetSize(); if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return; // flag aligned words bool aligned[16]; CHECK(targetLength < 16); for(size_t i=0; i<targetLength; i++) { aligned[i] = false; } for (AlignmentInfo::const_iterator alignmentPoint = alignmentInfo.begin(); alignmentPoint != alignmentInfo.end(); alignmentPoint++) { aligned[ alignmentPoint->second ] = true; } // process unaligned target words for(size_t i=0; i<targetLength; i++) { if (!aligned[i]) { Word w = targetPhrase.GetWord(i); if (!w.IsNonTerminal()) { const StringPiece word = w.GetFactor(m_factorType)->GetString(); if (word != "<s>" && word != "</s>") { if (!m_unrestricted && FindStringPiece(m_vocab, word ) == m_vocab.end()) { accumulator->PlusEquals(this,StringPiece("OTHER"),1); } else { accumulator->PlusEquals(this,word,1); } } } } } }
void LM::Evaluate(const Phrase &source , const TargetPhrase &targetPhrase , Scores &scores , Scores &estimatedFutureScore) const { SCORE all = 0, ngram = 0; PhraseVec phraseVec; phraseVec.reserve(m_order); for (size_t pos = 0; pos < targetPhrase.GetSize(); ++pos) { const Word &word = targetPhrase.GetWord(pos); ShiftOrPush(phraseVec, word); SCORE score = GetValueCache(phraseVec); all += score; if (phraseVec.size() == m_order) { ngram += score; } } SCORE estimated = all - ngram; scores.Add(*this, ngram); estimatedFutureScore.Add(*this, estimated); }
void WordTranslationFeature::EvaluateWithSourceContext(const InputType &input , const InputPath &inputPath , const TargetPhrase &targetPhrase , const StackVec *stackVec , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection *estimatedScores) const { const Sentence& sentence = static_cast<const Sentence&>(input); const AlignmentInfo &alignment = targetPhrase.GetAlignTerm(); // process aligned words for (AlignmentInfo::const_iterator alignmentPoint = alignment.begin(); alignmentPoint != alignment.end(); alignmentPoint++) { const Phrase& sourcePhrase = inputPath.GetPhrase(); int sourceIndex = alignmentPoint->first; int targetIndex = alignmentPoint->second; Word ws = sourcePhrase.GetWord(sourceIndex); if (m_factorTypeSource == 0 && ws.IsNonTerminal()) continue; Word wt = targetPhrase.GetWord(targetIndex); if (m_factorTypeSource == 0 && wt.IsNonTerminal()) continue; StringPiece sourceWord = ws.GetFactor(m_factorTypeSource)->GetString(); StringPiece targetWord = wt.GetFactor(m_factorTypeTarget)->GetString(); if (m_ignorePunctuation) { // check if source or target are punctuation char firstChar = sourceWord[0]; CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; firstChar = targetWord[0]; charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; } if (!m_unrestricted) { if (FindStringPiece(m_vocabSource, sourceWord) == m_vocabSource.end()) sourceWord = "OTHER"; if (FindStringPiece(m_vocabTarget, targetWord) == m_vocabTarget.end()) targetWord = "OTHER"; } if (m_simple) { // construct feature name util::StringStream featureName; featureName << m_description << "_"; featureName << sourceWord; featureName << "~"; featureName << targetWord; scoreBreakdown.SparsePlusEquals(featureName.str(), 1); } if (m_domainTrigger && !m_sourceContext) { const bool use_topicid = sentence.GetUseTopicId(); const bool use_topicid_prob = 
sentence.GetUseTopicIdAndProb(); if (use_topicid || use_topicid_prob) { if(use_topicid) { // use topicid as trigger const long topicid = sentence.GetTopicId(); util::StringStream feature; feature << m_description << "_"; if (topicid == -1) feature << "unk"; else feature << topicid; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } else { // use topic probabilities const vector<string> &topicid_prob = *(input.GetTopicIdAndProb()); if (atol(topicid_prob[0].c_str()) == -1) { util::StringStream feature; feature << m_description << "_unk_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } else { for (size_t i=0; i+1 < topicid_prob.size(); i+=2) { util::StringStream feature; feature << m_description << "_"; feature << topicid_prob[i]; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str())); } } } } else { // range over domain trigger words (keywords) const long docid = input.GetDocumentId(); for (boost::unordered_set<std::string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) { string sourceTrigger = *p; util::StringStream feature; feature << m_description << "_"; feature << sourceTrigger; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } } } if (m_sourceContext) { size_t globalSourceIndex = inputPath.GetWordsRange().GetStartPos() + sourceIndex; if (!m_domainTrigger && globalSourceIndex == 0) { // add <s> trigger feature for source util::StringStream feature; feature << m_description << "_"; feature << "<s>,"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } // range over source words to get context for(size_t contextIndex = 0; contextIndex 
< input.GetSize(); contextIndex++ ) { if (contextIndex == globalSourceIndex) continue; StringPiece sourceTrigger = input.GetWord(contextIndex).GetFactor(m_factorTypeSource)->GetString(); if (m_ignorePunctuation) { // check if trigger is punctuation char firstChar = sourceTrigger[0]; CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; } const long docid = input.GetDocumentId(); bool sourceTriggerExists = false; if (m_domainTrigger) sourceTriggerExists = FindStringPiece(m_vocabDomain[docid], sourceTrigger ) != m_vocabDomain[docid].end(); else if (!m_unrestricted) sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); if (m_domainTrigger) { if (sourceTriggerExists) { util::StringStream feature; feature << m_description << "_"; feature << sourceTrigger; feature << "_"; feature << sourceWord; feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } } else if (m_unrestricted || sourceTriggerExists) { util::StringStream feature; feature << m_description << "_"; if (contextIndex < globalSourceIndex) { feature << sourceTrigger; feature << ","; feature << sourceWord; } else { feature << sourceWord; feature << ","; feature << sourceTrigger; } feature << "~"; feature << targetWord; scoreBreakdown.SparsePlusEquals(feature.str(), 1); } } } if (m_targetContext) { throw runtime_error("Can't use target words outside current translation option in a stateless feature"); /* size_t globalTargetIndex = cur_hypo.GetCurrTargetWordsRange().GetStartPos() + targetIndex; if (globalTargetIndex == 0) { // add <s> trigger feature for source stringstream feature; feature << "wt_"; feature << sourceWord; feature << "~"; feature << "<s>,"; feature << targetWord; accumulator->SparsePlusEquals(feature.str(), 1); } // range over target words (up to current position) to get context for(size_t contextIndex = 0; contextIndex < globalTargetIndex; 
contextIndex++ ) { string targetTrigger = cur_hypo.GetWord(contextIndex).GetFactor(m_factorTypeTarget)->GetString(); if (m_ignorePunctuation) { // check if trigger is punctuation char firstChar = targetTrigger.at(0); CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; } bool targetTriggerExists = false; if (!m_unrestricted) targetTriggerExists = m_vocabTarget.find( targetTrigger ) != m_vocabTarget.end(); if (m_unrestricted || targetTriggerExists) { stringstream feature; feature << "wt_"; feature << sourceWord; feature << "~"; feature << targetTrigger; feature << ","; feature << targetWord; accumulator->SparsePlusEquals(feature.str(), 1); } }*/ } } }
void PhrasePairFeature::EvaluateWithSourceContext(const InputType &input , const InputPath &inputPath , const TargetPhrase &targetPhrase , const StackVec *stackVec , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection *estimatedFutureScore) const { const Phrase& source = inputPath.GetPhrase(); if (m_simple) { ostringstream namestr; namestr << "pp_"; namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); for (size_t i = 1; i < source.GetSize(); ++i) { const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); namestr << ","; namestr << sourceFactor->GetString(); } namestr << "~"; namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString(); for (size_t i = 1; i < targetPhrase.GetSize(); ++i) { const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId); namestr << ","; namestr << targetFactor->GetString(); } scoreBreakdown.SparsePlusEquals(namestr.str(),1); } if (m_domainTrigger) { const Sentence& isnt = static_cast<const Sentence&>(input); const bool use_topicid = isnt.GetUseTopicId(); const bool use_topicid_prob = isnt.GetUseTopicIdAndProb(); // compute pair ostringstream pair; pair << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); for (size_t i = 1; i < source.GetSize(); ++i) { const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); pair << ","; pair << sourceFactor->GetString(); } pair << "~"; pair << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString(); for (size_t i = 1; i < targetPhrase.GetSize(); ++i) { const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId); pair << ","; pair << targetFactor->GetString(); } if (use_topicid || use_topicid_prob) { if(use_topicid) { // use topicid as trigger const long topicid = isnt.GetTopicId(); stringstream feature; feature << "pp_"; if (topicid == -1) feature << "unk"; else feature << topicid; feature << "_"; feature << pair.str(); 
scoreBreakdown.SparsePlusEquals(feature.str(), 1); } else { // use topic probabilities const vector<string> &topicid_prob = *(isnt.GetTopicIdAndProb()); if (atol(topicid_prob[0].c_str()) == -1) { stringstream feature; feature << "pp_unk_"; feature << pair.str(); scoreBreakdown.SparsePlusEquals(feature.str(), 1); } else { for (size_t i=0; i+1 < topicid_prob.size(); i+=2) { stringstream feature; feature << "pp_"; feature << topicid_prob[i]; feature << "_"; feature << pair.str(); scoreBreakdown.SparsePlusEquals(feature.str(), atof((topicid_prob[i+1]).c_str())); } } } } else { // range over domain trigger words const long docid = isnt.GetDocumentId(); for (set<string>::const_iterator p = m_vocabDomain[docid].begin(); p != m_vocabDomain[docid].end(); ++p) { string sourceTrigger = *p; ostringstream namestr; namestr << "pp_"; namestr << sourceTrigger; namestr << "_"; namestr << pair.str(); scoreBreakdown.SparsePlusEquals(namestr.str(),1); } } } if (m_sourceContext) { const Sentence& isnt = static_cast<const Sentence&>(input); // range over source words to get context for(size_t contextIndex = 0; contextIndex < isnt.GetSize(); contextIndex++ ) { StringPiece sourceTrigger = isnt.GetWord(contextIndex).GetFactor(m_sourceFactorId)->GetString(); if (m_ignorePunctuation) { // check if trigger is punctuation char firstChar = sourceTrigger[0]; CharHash::const_iterator charIterator = m_punctuationHash.find( firstChar ); if(charIterator != m_punctuationHash.end()) continue; } bool sourceTriggerExists = false; if (!m_unrestricted) sourceTriggerExists = FindStringPiece(m_vocabSource, sourceTrigger ) != m_vocabSource.end(); if (m_unrestricted || sourceTriggerExists) { ostringstream namestr; namestr << "pp_"; namestr << sourceTrigger; namestr << "~"; namestr << source.GetWord(0).GetFactor(m_sourceFactorId)->GetString(); for (size_t i = 1; i < source.GetSize(); ++i) { const Factor* sourceFactor = source.GetWord(i).GetFactor(m_sourceFactorId); namestr << ","; namestr << 
sourceFactor->GetString(); } namestr << "~"; namestr << targetPhrase.GetWord(0).GetFactor(m_targetFactorId)->GetString(); for (size_t i = 1; i < targetPhrase.GetSize(); ++i) { const Factor* targetFactor = targetPhrase.GetWord(i).GetFactor(m_targetFactorId); namestr << ","; namestr << targetFactor->GetString(); } scoreBreakdown.SparsePlusEquals(namestr.str(),1); } } } }