TargetPhraseCollection* PhraseDictionaryMultiModel::CreateTargetPhraseCollectionLinearInterpolation(const Phrase& src, std::map<std::string,multiModelStatistics*>* allStats, std::vector<std::vector<float> > &multimodelweights) const { TargetPhraseCollection *ret = new TargetPhraseCollection(); for ( std::map< std::string, multiModelStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) { multiModelStatistics * statistics = iter->second; Scores scoreVector(m_numScoreComponents); for(size_t i = 0; i < m_numScoreComponents; ++i) { scoreVector[i] = TransformScore(std::inner_product(statistics->p[i].begin(), statistics->p[i].end(), multimodelweights[i].begin(), 0.0)); } statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); //correct future cost estimates and total score vector<FeatureFunction*> pd_feature; pd_feature.push_back(const_cast<PhraseDictionaryMultiModel*>(this)); const vector<FeatureFunction*> pd_feature_const(pd_feature); statistics->targetPhrase->EvaluateInIsolation(src, pd_feature_const); ret->Add(new TargetPhrase(*statistics->targetPhrase)); } return ret; }
void TargetPhrase::SetScore(float score) { //we use an existing score producer to figure out information for score setting (number of scores and weights) //TODO: is this a good idea? // Assume the default system. const TranslationSystem& system = StaticData::Instance().GetTranslationSystem(TranslationSystem::DEFAULT); const ScoreProducer* prod = system.GetPhraseDictionaries()[0]; //get the weight list unsigned int id = prod->GetScoreBookkeepingID(); const vector<float> &allWeights = StaticData::Instance().GetAllWeights(); size_t beginIndex = StaticData::Instance().GetScoreIndexManager().GetBeginIndex(id); size_t endIndex = StaticData::Instance().GetScoreIndexManager().GetEndIndex(id); vector<float> weights; std::copy(allWeights.begin() +beginIndex, allWeights.begin() + endIndex,std::back_inserter(weights)); //find out how many items are in the score vector for this producer size_t numScores = prod->GetNumScoreComponents(); //divide up the score among all of the score vectors vector <float> scoreVector(numScores,score/numScores); //Now we have what we need to call the full SetScore method SetScore(prod,scoreVector,weights,system.GetWeightWordPenalty(),system.GetLanguageModels()); }
void PhraseDictionaryMultiModelCounts::CollectSufficientStatistics(const Phrase& src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats) const //fill fs and allStats with statistics from models { for(size_t i = 0; i < m_numModels; ++i) { const PhraseDictionary &pd = *m_pd[i]; TargetPhraseCollection *ret_raw = (TargetPhraseCollection*) pd.GetTargetPhraseCollection( src); if (ret_raw != NULL) { TargetPhraseCollection::iterator iterTargetPhrase; for (iterTargetPhrase = ret_raw->begin(); iterTargetPhrase != ret_raw->end(); ++iterTargetPhrase) { TargetPhrase * targetPhrase = *iterTargetPhrase; vector<float> raw_scores = targetPhrase->GetScoreBreakdown().GetScoresForProducer(&pd); string targetString = targetPhrase->GetStringRep(m_output); if (allStats->find(targetString) == allStats->end()) { multiModelCountsStatistics * statistics = new multiModelCountsStatistics; statistics->targetPhrase = new TargetPhrase(*targetPhrase); //make a copy so that we don't overwrite the original phrase table info // zero out scores from original phrase table statistics->targetPhrase->GetScoreBreakdown().ZeroDenseFeatures(&pd); statistics->fst.resize(m_numModels); statistics->ft.resize(m_numModels); Scores scoreVector(5); scoreVector[0] = -raw_scores[0]; scoreVector[1] = -raw_scores[1]; scoreVector[2] = -raw_scores[2]; statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); // set scores to 0 statistics->targetPhrase->Evaluate(src, GetFeaturesToApply()); (*allStats)[targetString] = statistics; } multiModelCountsStatistics * statistics = (*allStats)[targetString]; statistics->fst[i] = UntransformScore(raw_scores[0]); statistics->ft[i] = UntransformScore(raw_scores[1]); fs[i] = UntransformScore(raw_scores[2]); (*allStats)[targetString] = statistics; } } } // get target phrase frequency for models which have not seen the phrase pair for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) { multiModelCountsStatistics * statistics = iter->second; for (size_t i = 0; i < m_numModels; ++i) { if (!statistics->ft[i]) { statistics->ft[i] = GetTargetCount(static_cast<const Phrase&>(*statistics->targetPhrase), i); } } } }
void TargetPhrase::SetXMLScore(float score) { const StaticData &staticData = StaticData::Instance(); const FeatureFunction* prod = PhraseDictionary::GetColl()[0]; size_t numScores = prod->GetNumScoreComponents(); vector <float> scoreVector(numScores,score/numScores); m_scoreBreakdown.Assign(prod, scoreVector); }
void TargetPhrase::SetScore(float score) { //we use an existing score producer to figure out information for score setting (number of scores and weights) //TODO: is this a good idea? // Assume the default system. const TranslationSystem& system = StaticData::Instance().GetTranslationSystem(TranslationSystem::DEFAULT); const ScoreProducer* prod = system.GetPhraseDictionaries()[0]; vector<float> weights = StaticData::Instance().GetWeights(prod); //find out how many items are in the score vector for this producer size_t numScores = prod->GetNumScoreComponents(); //divide up the score among all of the score vectors vector <float> scoreVector(numScores,score/numScores); //Now we have what we need to call the full SetScore method SetScore(prod, scoreVector, ScoreComponentCollection(), weights, system.GetWeightWordPenalty(), system.GetLanguageModels()); }
TargetPhraseCollection* PhraseDictionaryMultiModelCounts::CreateTargetPhraseCollectionCounts(const Phrase &src, vector<float> &fs, map<string,multiModelCountsStatistics*>* allStats, vector<vector<float> > &multimodelweights) const { TargetPhraseCollection *ret = new TargetPhraseCollection(); for ( map< string, multiModelCountsStatistics*>::const_iterator iter = allStats->begin(); iter != allStats->end(); ++iter ) { multiModelCountsStatistics * statistics = iter->second; if (statistics->targetPhrase->GetAlignTerm().GetSize() == 0) { UTIL_THROW(util::Exception, " alignment information empty\ncount-tables need to include alignment information for computation of lexical weights.\nUse --phrase-word-alignment during training; for on-disk tables, also set -alignment-info when creating on-disk tables."); } try { pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(src, static_cast<const Phrase&>(*statistics->targetPhrase), statistics->targetPhrase->GetAlignTerm()); vector< set<size_t> > alignedToT = alignment.first; vector< set<size_t> > alignedToS = alignment.second; double lexst = ComputeWeightedLexicalTranslation(static_cast<const Phrase&>(*statistics->targetPhrase), src, alignedToS, m_lexTable_e2f, multimodelweights[1], false ); double lexts = ComputeWeightedLexicalTranslation(src, static_cast<const Phrase&>(*statistics->targetPhrase), alignedToT, m_lexTable_f2e, multimodelweights[3], true ); Scores scoreVector(5); scoreVector[0] = FloorScore(TransformScore(m_combineFunction(statistics->fst, statistics->ft, multimodelweights[0]))); scoreVector[1] = FloorScore(TransformScore(lexst)); scoreVector[2] = FloorScore(TransformScore(m_combineFunction(statistics->fst, fs, multimodelweights[2]))); scoreVector[3] = FloorScore(TransformScore(lexts)); scoreVector[4] = FloorScore(TransformScore(2.718)); statistics->targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); statistics->targetPhrase->Evaluate(src, GetFeaturesToApply()); } catch (AlignmentException& e) { continue; } ret->Add(new TargetPhrase(*statistics->targetPhrase)); } RemoveAllInMap(*allStats); delete allStats; return ret; }
bool RuleTableLoaderCompact::LoadRuleSection( LineReader &reader, const std::vector<Word> &vocab, const std::vector<Phrase> &sourcePhrases, const std::vector<Phrase> &targetPhrases, const std::vector<size_t> &targetLhsIds, const std::vector<const AlignmentInfo *> &alignmentSets, RuleTableTrie &ruleTable) { // Read rule count. reader.ReadLine(); const size_t ruleCount = std::atoi(reader.m_line.c_str()); // Read rules and add to table. const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); std::vector<float> scoreVector(numScoreComponents); std::vector<size_t> tokenPositions; for (size_t i = 0; i < ruleCount; ++i) { reader.ReadLine(); tokenPositions.clear(); FindTokens(tokenPositions, reader.m_line); const char *charLine = reader.m_line.c_str(); // The first three tokens are IDs for the source phrase, target phrase, // and alignment set. const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]); const int targetPhraseId = std::atoi(charLine+tokenPositions[1]); const int alignmentSetId = std::atoi(charLine+tokenPositions[2]); const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId]; const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId]; const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]); Word sourceLHS("X"); // TODO not implemented for compact const AlignmentInfo *alignNonTerm = alignmentSets[alignmentSetId]; // Then there should be one score for each score component. for (size_t j = 0; j < numScoreComponents; ++j) { float score = std::atof(charLine+tokenPositions[3+j]); scoreVector[j] = FloorScore(TransformScore(score)); } if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') { std::stringstream msg; msg << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << reader.m_lineNum; UserMessage::Add(msg.str()); return false; } // The remaining columns are currently ignored. // Create and score target phrase. TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase); targetPhrase->SetAlignNonTerm(alignNonTerm); targetPhrase->SetTargetLHS(targetLhs); targetPhrase->SetSourcePhrase(sourcePhrase); targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply()); // Insert rule into table. TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection( ruleTable, sourcePhrase, *targetPhrase, &sourceLHS); coll.Add(targetPhrase); } return true; }