bool RuleTableLoaderStandard::Load(FormatType format , const std::vector<FactorType> &input , const std::vector<FactorType> &output , const std::string &inFile , const std::vector<float> &weight , size_t /* tableLimit */ , const LMList &languageModels , const WordPenaltyProducer* wpProducer , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables vector<float> scoreVector; StringPiece line; std::string hiero_before, hiero_after; while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } if (format == HieroFormat) { // inefficiently reformat line hiero_before.assign(line.data(), line.size()); ReformatHieroRule(hiero_before, hiero_after); line = hiero_after; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString(*++pipes); // TODO(bhaddow) efficiently handle default instead of parsing this string every time. StringPiece ruleCountString = ++pipes ? 
*pipes : StringPiece("1 1"); if (++pipes) { stringstream strme; strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count; UserMessage::Add(strme.str()); abort(); } bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { char *err_ind; scoreVector.push_back(strtod(s->data(), &err_ind)); UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count); } const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { stringstream strme; strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } // parse source & find pt node // constituent labels Word sourceLHS, targetLHS; // source Phrase sourcePhrase( 0); sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS); // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(Output); targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS); targetPhrase->SetSourcePhrase(sourcePhrase); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString, sourcePhrase); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetRuleCount(ruleCountString, scoreVector[0]); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); // component score, for n-best output std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); 
targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; } // sort and prune each target phrase collection SortAndPrune(ruleTable); return true; }
/**
 * Per-sentence initialization: extract a sentence-specific phrase table via
 * the fuzzy-match wrapper and load it into this sentence's trie node.
 *
 * Writes the input words (skipping the <s>/</s> boundary positions 0 and
 * GetSize()-1) to a file in a fresh mkdtemp directory, asks
 * m_FuzzyMatchWrapper to produce a phrase-table file for this translation id,
 * then parses that file (Moses format only) exactly like
 * RuleTableLoaderStandard does.
 *
 * Fixes vs. previous revision:
 *  - removed a duplicate score-count check (an UTIL_THROW_IF2 repeating the
 *    condition of the if/abort immediately before it — unreachable);
 *  - removed the dead `delete line` branch: the Hiero path throws
 *    unconditionally, so `line` always points at the stack-local lineOrig;
 *  - the skip message now says "empty source" to match the check on
 *    sourcePhraseString (consistent with PhraseDictionaryMemory::Load).
 *
 * NOTE(review): the temporary directory is never removed (the cleanup call
 * at the end is commented out) — confirm whether leaking /tmp/moses.XXXXXX
 * per sentence is intentional.
 */
void PhraseDictionaryFuzzyMatch::InitializeForInput(InputType const& inputSentence)
{
  char dirName[] = "/tmp/moses.XXXXXX";
  char *temp = mkdtemp(dirName);
  UTIL_THROW_IF2(temp == NULL, "Couldn't create temporary directory " << dirName);

  string dirNameStr(dirName);

  // write the sentence (without boundary words) for the fuzzy matcher
  string inFileName(dirNameStr + "/in");
  ofstream inFile(inFileName.c_str());
  for (size_t i = 1; i < inputSentence.GetSize() - 1; ++i) {
    inFile << inputSentence.GetWord(i);
  }
  inFile << endl;
  inFile.close();

  long translationId = inputSentence.GetTranslationId();
  string ptFileName = m_FuzzyMatchWrapper->Extract(translationId, dirNameStr);

  // populate with rules for this sentence
  PhraseDictionaryNodeMemory &rootNode = m_collection[translationId];
  FormatType format = MosesFormat;

  // data from file
  InputFileStream inStream(ptFileName);

  // copied from class LoaderStandard
  PrintUserTime("Start loading fuzzy-match phrase model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  string lineOrig;
  size_t count = 0;  // rules loaded so far; used as line index in diagnostics
  while(getline(inStream, lineOrig)) {
    const string *line;
    if (format == HieroFormat) { // reformat line
      UTIL_THROW(util::Exception, "Cannot be Hiero format");
      //line = ReformatHieroRule(lineOrig);
    } else {
      // do nothing to format of line
      line = &lineOrig;
    }

    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, *line , "|||" );

    // expect: source ||| target ||| scores ||| alignment [ ||| counts ]
    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << ptFileName << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                 , &targetPhraseString = tokens[1]
                 , &scoreString = tokens[2]
                 , &alignString = tokens[3];

    // skip rules with an all-whitespace source side unless word deletion is on
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( ptFileName << ":" << count << ": pt entry contains empty source, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    const size_t numScoreComponents = GetNumScoreComponents();
    if (scoreVector.size() != numScoreComponents) {
      stringstream strme;
      strme << "Size of scoreVector != number (" << scoreVector.size() << "!="
            << numScoreComponents << ") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }

    // parse source & find pt node

    // constituent labels
    Word *sourceLHS;
    Word *targetLHS;

    // source
    Phrase sourcePhrase( 0);
    sourcePhrase.CreateFromString(Input, m_input, sourcePhraseString, factorDelimiter, &sourceLHS);

    // create target phrase obj; ownership passes to phraseColl below
    TargetPhrase *targetPhrase = new TargetPhrase();
    targetPhrase->CreateFromString(Output, m_output, targetPhraseString, factorDelimiter, &targetLHS);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignString);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->GetScoreBreakdown().Assign(this, scoreVector);
    targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(rootNode, sourcePhrase, *targetPhrase, sourceLHS);
    phraseColl.Add(targetPhrase);

    count++;
  }

  // sort and prune each target phrase collection
  SortAndPrune(rootNode);

  //removedirectoryrecursively(dirName);
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const string &filePath , const vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { const StaticData &staticData = StaticData::Instance(); m_tableLimit = tableLimit; // data from file InputFileStream inFile(filePath); // create hash file if necessary ofstream tempFile; string tempFilePath; vector< vector<string> > phraseVector; string line, prevSourcePhrase = ""; size_t count = 0; size_t line_num = 0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info while(getline(inFile, line)) { ++line_num; vector<string> tokens = TokenizeMultiCharSeparator( line , "|||" ); if (numElement == NOT_FOUND) { // init numElement numElement = tokens.size(); assert(numElement >= 3); // extended style: source ||| target ||| scores ||| [alignment] ||| [counts] } if (tokens.size() != numElement) { stringstream strme; strme << "Syntax error at " << filePath << ":" << line_num; UserMessage::Add(strme.str()); abort(); } const string &sourcePhraseString=tokens[0] ,&targetPhraseString=tokens[1] ,&scoreString = tokens[2]; bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty target, skipping\n"); continue; } const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); if (sourcePhraseString != prevSourcePhrase) phraseVector = Phrase::Parse(sourcePhraseString, input, factorDelimiter); vector<float> scoreVector = Tokenize<float>(scoreString); if (scoreVector.size() != m_numScoreComponent) { stringstream strme; strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num; UserMessage::Add(strme.str()); abort(); } // source Phrase sourcePhrase(Input); 
sourcePhrase.CreateFromString( input, phraseVector); //target TargetPhrase targetPhrase(Output); targetPhrase.SetSourcePhrase(&sourcePhrase); targetPhrase.CreateFromString( output, targetPhraseString, factorDelimiter); if (tokens.size() > 3) targetPhrase.SetAlignmentInfo(tokens[3]); // component score, for n-best output std::vector<float> scv(scoreVector.size()); std::transform(scoreVector.begin(),scoreVector.end(),scv.begin(),TransformScore); std::transform(scv.begin(),scv.end(),scv.begin(),FloorScore); targetPhrase.SetScore(m_feature, scv, weight, weightWP, languageModels); AddEquivPhrase(sourcePhrase, targetPhrase); count++; } // sort each target phrase collection m_collection.Sort(m_tableLimit); return true; }
/**
 * Learn per-model weights that minimize cross-entropy over a sample of
 * phrase pairs, for the count-based multi-model phrase table.
 *
 * Steps visible below:
 *  1. collapse the phrase-pair sample into a (pair -> frequency) map;
 *  2. for each distinct pair, collect per-model sufficient statistics
 *     (fs / fst / ft counts plus lexical caches in both directions) into
 *     multiModelCountsStatisticsOptimization records;
 *  3. run Optimize() once per feature (4 features) and normalize the
 *     resulting weight vector according to m_mode ("interpolate" or
 *     "instance_weighting").
 *
 * Returns a flat vector of m_numModels*4 weights, laid out feature-major
 * (weights for feature i occupy [i*m_numModels, (i+1)*m_numModels)).
 * Also prints each feature's weight vector to stderr.
 */
vector<float> PhraseDictionaryMultiModelCounts::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{
  const StaticData &staticData = StaticData::Instance();
  const string& factorDelimiter = staticData.GetFactorDelimiter();

  // frequency of each distinct (source, target) pair in the sample
  map<pair<string, string>, size_t> phrase_pair_map;

  for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
    phrase_pair_map[*iter] += 1;
  }

  vector<multiModelCountsStatisticsOptimization*> optimizerStats;  // owned; freed by RemoveAllInColl at the end

  for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {
    pair<string, string> phrase_pair = iter->first;
    string source_string = phrase_pair.first;
    string target_string = phrase_pair.second;

    vector<float> fs(m_numModels);
    // owned locally; entries freed via RemoveAllInMap before each delete
    map<string,multiModelCountsStatistics*>* allStats = new(map<string,multiModelCountsStatistics*>);

    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromString(Input, m_input, source_string, factorDelimiter, NULL);
    CollectSufficientStatistics(sourcePhrase, fs, allStats); //optimization potential: only call this once per source phrase

    //phrase pair not found; leave cache empty
    if (allStats->find(target_string) == allStats->end()) {
      RemoveAllInMap(*allStats);
      delete allStats;
      continue;
    }

    multiModelCountsStatisticsOptimization * targetStatistics = new multiModelCountsStatisticsOptimization();
    // deep-copy the target phrase: allStats (and its phrases) are freed below
    targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
    targetStatistics->fs = fs;
    targetStatistics->fst = (*allStats)[target_string]->fst;
    targetStatistics->ft = (*allStats)[target_string]->ft;
    targetStatistics->f = iter->second;

    try {
      // lexical caches in both directions (e2f and f2e), built from the term alignment
      pair<vector< set<size_t> >, vector< set<size_t> > > alignment = GetAlignmentsForLexWeights(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), targetStatistics->targetPhrase->GetAlignTerm());
      targetStatistics->lexCachee2f = CacheLexicalStatistics(static_cast<const Phrase&>(*targetStatistics->targetPhrase), sourcePhrase, alignment.second, m_lexTable_e2f, false );
      targetStatistics->lexCachef2e = CacheLexicalStatistics(sourcePhrase, static_cast<const Phrase&>(*targetStatistics->targetPhrase), alignment.first, m_lexTable_f2e, true );

      optimizerStats.push_back(targetStatistics);
    } catch (AlignmentException& e) {}
    // NOTE(review): if AlignmentException is thrown, targetStatistics (and its
    // copied targetPhrase) are never pushed nor deleted — looks like a leak;
    // confirm whether the record's destructor elsewhere is expected to run.

    RemoveAllInMap(*allStats);
    delete allStats;
  }

  Sentence sentence;
  CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables

  // 4 count-based features, m_numModels weights each, feature-major layout
  vector<float> ret (m_numModels*4);
  for (size_t iFeature=0; iFeature < 4; iFeature++) {

    CrossEntropyCounts * ObjectiveFunction = new CrossEntropyCounts(optimizerStats, this, iFeature);

    vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);

    if (m_mode == "interpolate") {
      weight_vector = normalizeWeights(weight_vector);
    } else if (m_mode == "instance_weighting") {
      // rescale so the first model's weight is 1
      float first_value = weight_vector[0];
      for (size_t i=0; i < m_numModels; i++) {
        weight_vector[i] = weight_vector[i]/first_value;
      }
    }
    cerr << "Weight vector for feature " << iFeature << ": ";
    for (size_t i=0; i < m_numModels; i++) {
      ret[(iFeature*m_numModels)+i] = weight_vector[i];
      cerr << weight_vector[i] << " ";
    }
    cerr << endl;
    delete ObjectiveFunction;
  }

  RemoveAllInColl(optimizerStats);
  return ret;
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const string &filePath , const vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing(); const StaticData &staticData = StaticData::Instance(); m_tableLimit = tableLimit; util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL); size_t line_num = 0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info const std::string& factorDelimiter = staticData.GetFactorDelimiter(); Phrase sourcePhrase(0); std::vector<float> scv; scv.reserve(m_numScoreComponent); TargetPhraseCollection *preSourceNode = NULL; std::string preSourceString; while(true) { ++line_num; StringPiece line; try { line = inFile.ReadLine(); } catch (util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||")); StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece scoreString(GrabOrDie(pipes, filePath, line_num)); bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t")); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n"); continue; } //target std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase()); targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter); scv.clear(); for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) { char *err_ind; // Token is always delimited by some form of space. Also, apparently strtod is portable but strtof isn't. 
scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind))))); if (err_ind == token->data()) { stringstream strme; strme << "Bad number " << token << " on line " << line_num; UserMessage::Add(strme.str()); abort(); } } if (scv.size() != m_numScoreComponent) { stringstream strme; strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num; UserMessage::Add(strme.str()); abort(); } size_t consumed = 3; if (pipes) { targetPhrase->SetAlignmentInfo(*pipes++); ++consumed; } ScoreComponentCollection sparse; if (pipes) pipes++; //counts if (pipes) { //sparse features SparsePhraseDictionaryFeature* spdf = GetFeature()->GetSparsePhraseDictionaryFeature(); if (spdf) { sparse.Assign(spdf,(pipes++)->as_string()); } } // scv good to go sir! targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels); // Check number of entries delimited by ||| agrees across all lines. for (; pipes; ++pipes, ++consumed) {} if (numElement != consumed) { if (numElement == NOT_FOUND) { numElement = consumed; } else { stringstream strme; strme << "Syntax error at " << filePath << ":" << line_num; UserMessage::Add(strme.str()); abort(); } } //TODO: Would be better to reuse source phrases, but ownership has to be //consistent across phrase table implementations sourcePhrase.Clear(); sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter); //Now that the source phrase is ready, we give the target phrase a copy targetPhrase->SetSourcePhrase(sourcePhrase); if (preSourceString == sourcePhraseString && preSourceNode) { preSourceNode->Add(targetPhrase.release()); } else { preSourceNode = CreateTargetPhraseCollection(sourcePhrase); preSourceNode->Add(targetPhrase.release()); preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size()); } } // sort each target phrase collection m_collection.Sort(m_tableLimit); /* // TODO ASK OLIVER WHY THIS IS NEEDED 
const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing(); */ return true; }
/**
 * Load a "new format" phrase table from a stream.
 *
 * Note the field order here differs from the other loaders in this file:
 *   source ||| target ||| alignment ||| scores [ ||| counts ]
 * (alignment is tokens[2] and scores tokens[3]).  Scores are transformed
 * (log) and floored before SetScoreChart.  Collections are sorted to
 * m_tableLimit at the end.
 *
 * Returns true on success; malformed lines abort the process.
 */
bool PhraseDictionaryNewFormat::Load(const std::vector<FactorType> &input
                                     , const std::vector<FactorType> &output
                                     , std::istream &inStream
                                     , const std::vector<float> &weight
                                     , size_t tableLimit
                                     , const LMList &languageModels
                                     , float weightWP)
{
  PrintUserTime("Start loading new format pt model");

  const StaticData &staticData = StaticData::Instance();
  const std::string& factorDelimiter = staticData.GetFactorDelimiter();

  VERBOSE(2,"PhraseDictionaryNewFormat: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl);

  string line;
  size_t count = 0;  // entries loaded so far; used as line index in diagnostics

  while(getline(inStream, line)) {
    vector<string> tokens;
    vector<float> scoreVector;

    TokenizeMultiCharSeparator(tokens, line , "|||" );

    // expect exactly 4 or 5 |||-delimited fields
    if (tokens.size() != 4 && tokens.size() != 5) {
      stringstream strme;
      strme << "Syntax error at " << m_filePath << ":" << count;
      UserMessage::Add(strme.str());
      abort();
    }

    const string &sourcePhraseString = tokens[0]
                 , &targetPhraseString = tokens[1]
                 , &alignString = tokens[2]
                 , &scoreString = tokens[3];

    // skip entries whose source side is all whitespace unless word deletion is on
    // (the message text says "target" but the check is on the source string)
    bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
    if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
      TRACE_ERR( m_filePath << ":" << count << ": pt entry contains empty target, skipping\n");
      continue;
    }

    Tokenize<float>(scoreVector, scoreString);
    if (scoreVector.size() != m_numScoreComponent) {
      stringstream strme;
      strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << count;
      UserMessage::Add(strme.str());
      abort();
    }
    // redundant: the abort above already guarantees this
    assert(scoreVector.size() == m_numScoreComponent);

    // parse source & find pt node

    // head word
    Word sourceLHS, targetLHS;

    // source
    Phrase sourcePhrase(Input);
    sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS);

    // create target phrase obj; ownership passes to phraseColl via AddEquivPhrase
    TargetPhrase *targetPhrase = new TargetPhrase(Output);
    targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);

    // alignment
    list<pair<size_t,size_t> > alignmentInfo;
    CreateAlignmentInfo(alignmentInfo, alignString);

    // rest of target phrase
    targetPhrase->SetAlignmentInfo(alignmentInfo);
    targetPhrase->SetTargetLHS(targetLHS);
    //targetPhrase->SetDebugOutput(string("New Format pt ") + line);

    // component score, for n-best output
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore);
    std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore);

    targetPhrase->SetScoreChart(GetFeature(), scoreVector, weight, languageModels);

    // count info for backoff
    // NOTE(review): unreachable — tokens.size() is constrained to 4 or 5 above,
    // so this condition can never hold; possibly the intent was
    // tokens.size() == 5 with tokens[4].  Confirm before changing.
    if (tokens.size() >= 6)
      targetPhrase->CreateCountInfo(tokens[5]);

    TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(sourcePhrase, *targetPhrase);
    AddEquivPhrase(phraseColl, targetPhrase);

    count++;
  }

  // cleanup cache

  // sort each target phrase collection
  m_collection.Sort(m_tableLimit);

  return true;
}
/**
 * Learn per-model weights that minimize cross-entropy over a sample of
 * phrase pairs (probability-based multi-model variant; see the counts-based
 * sibling in PhraseDictionaryMultiModelCounts).
 *
 * Steps visible below:
 *  1. collapse the phrase-pair sample into a (pair -> frequency) map;
 *  2. for each distinct pair, collect per-model statistics (the `p` vectors)
 *     into multiModelStatisticsOptimization records;
 *  3. run Optimize() once per score component and, in "interpolate" mode,
 *     normalize each weight vector.
 *
 * Returns a flat vector of m_numModels*m_numScoreComponents weights,
 * feature-major (weights for feature i occupy [i*m_numModels, (i+1)*m_numModels)).
 * Also prints each feature's weight vector to stderr.
 */
vector<float> PhraseDictionaryMultiModel::MinimizePerplexity(vector<pair<string, string> > &phrase_pair_vector)
{
  // frequency of each distinct (source, target) pair in the sample
  map<pair<string, string>, size_t> phrase_pair_map;

  for ( vector<pair<string, string> >::const_iterator iter = phrase_pair_vector.begin(); iter != phrase_pair_vector.end(); ++iter ) {
    phrase_pair_map[*iter] += 1;
  }

  vector<multiModelStatisticsOptimization*> optimizerStats;  // owned; freed by RemoveAllInColl at the end

  for ( map<pair<string, string>, size_t>::iterator iter = phrase_pair_map.begin(); iter != phrase_pair_map.end(); ++iter ) {

    pair<string, string> phrase_pair = iter->first;
    string source_string = phrase_pair.first;
    string target_string = phrase_pair.second;

    vector<float> fs(m_numModels);
    // owned locally; entries freed via RemoveAllInMap before each delete
    map<string,multiModelStatistics*>* allStats = new(map<string,multiModelStatistics*>);

    Phrase sourcePhrase(0);
    sourcePhrase.CreateFromString(Input, m_input, source_string, NULL);
    CollectSufficientStatistics(sourcePhrase, allStats); //optimization potential: only call this once per source phrase

    //phrase pair not found; leave cache empty
    if (allStats->find(target_string) == allStats->end()) {
      RemoveAllInMap(*allStats);
      delete allStats;
      continue;
    }

    multiModelStatisticsOptimization* targetStatistics = new multiModelStatisticsOptimization();
    // deep-copy the target phrase: allStats (and its phrases) are freed below
    targetStatistics->targetPhrase = new TargetPhrase(*(*allStats)[target_string]->targetPhrase);
    targetStatistics->p = (*allStats)[target_string]->p;
    targetStatistics->f = iter->second;
    optimizerStats.push_back(targetStatistics);

    RemoveAllInMap(*allStats);
    delete allStats;
  }

  Sentence sentence;
  CleanUpAfterSentenceProcessing(sentence); // free memory used by compact phrase tables

  // one weight vector per score component, feature-major layout
  size_t numWeights = m_numScoreComponents;
  vector<float> ret (m_numModels*numWeights);
  for (size_t iFeature=0; iFeature < numWeights; iFeature++) {

    CrossEntropy * ObjectiveFunction = new CrossEntropy(optimizerStats, this, iFeature);

    vector<float> weight_vector = Optimize(ObjectiveFunction, m_numModels);

    if (m_mode == "interpolate") {
      weight_vector = normalizeWeights(weight_vector);
    }

    cerr << "Weight vector for feature " << iFeature << ": ";
    for (size_t i=0; i < m_numModels; i++) {
      ret[(iFeature*m_numModels)+i] = weight_vector[i];
      cerr << weight_vector[i] << " ";
    }
    cerr << endl;
    delete ObjectiveFunction;
  }

  RemoveAllInColl(optimizerStats);
  return ret;
}
bool RuleTableLoaderStandard::Load(FormatType format , const std::vector<FactorType> &input , const std::vector<FactorType> &output , std::istream &inStream , const std::vector<float> &weight , size_t /* tableLimit */ , const LMList &languageModels , const WordPenaltyProducer* wpProducer , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; while(getline(inStream, lineOrig)) { const string *line; if (format == HieroFormat) { // reformat line line = ReformatHieroRule(lineOrig); } else { // do nothing to format of line line = &lineOrig; } vector<string> tokens; vector<float> scoreVector; TokenizeMultiCharSeparator(tokens, *line , "|||" ); if (tokens.size() != 4 && tokens.size() != 5) { stringstream strme; strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count; UserMessage::Add(strme.str()); abort(); } const string &sourcePhraseString = tokens[0] , &targetPhraseString = tokens[1] , &scoreString = tokens[2] , &alignString = tokens[3]; bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } Tokenize<float>(scoreVector, scoreString); const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { stringstream strme; strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } CHECK(scoreVector.size() == numScoreComponents); // parse source & find pt node // constituent labels Word sourceLHS, targetLHS; // source Phrase 
sourcePhrase( 0); sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS); // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(Output); targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); // component score, for n-best output std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; if (format == HieroFormat) { // reformat line delete line; } else { // do nothing } } // sort and prune each target phrase collection SortAndPrune(ruleTable); return true; }