//Find top n translations of source, and send them to output static void outputTopN(const StringPiece& sourcePhraseString, PhraseDictionary* phraseTable, const std::vector<FactorType> &input, ostream& out) { //get list of target phrases Phrase sourcePhrase; sourcePhrase.CreateFromString(Input,input,sourcePhraseString,NULL); InputPath inputPath(sourcePhrase, NonTerminalSet(), WordsRange(0,sourcePhrase.GetSize()-1),NULL,NULL); InputPathList inputPaths; inputPaths.push_back(&inputPath); phraseTable->GetTargetPhraseCollectionBatch(inputPaths); const TargetPhraseCollection* targetPhrases = inputPath.GetTargetPhrases(*phraseTable); //print phrases const std::vector<FactorType>& output = StaticData::Instance().GetOutputFactorOrder(); if (targetPhrases) { //if (targetPhrases->GetSize() > 10) cerr << "src " << sourcePhrase << " tgt count " << targetPhrases->GetSize() << endl; for (TargetPhraseCollection::const_iterator i = targetPhrases->begin(); i != targetPhrases->end(); ++i) { const TargetPhrase* targetPhrase = *i; out << sourcePhrase.GetStringRep(input); out << " ||| "; out << targetPhrase->GetStringRep(output); out << " ||| "; const ScoreComponentCollection scores = targetPhrase->GetScoreBreakdown(); vector<float> phraseScores = scores.GetScoresForProducer(phraseTable); for (size_t j = 0; j < phraseScores.size(); ++j) { out << exp(phraseScores[j]) << " "; } out << "||| "; const AlignmentInfo& align = targetPhrase->GetAlignTerm(); for (AlignmentInfo::const_iterator j = align.begin(); j != align.end(); ++j) { out << j->first << "-" << j->second << " "; } out << endl; } } }
bool RuleTableLoaderStandard::Load(FormatType format , const std::vector<FactorType> &input , const std::vector<FactorType> &output , const std::string &inFile , size_t /* tableLimit */ , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables vector<float> scoreVector; StringPiece line; std::string hiero_before, hiero_after; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } if (format == HieroFormat) { // inefficiently reformat line hiero_before.assign(line.data(), line.size()); ReformatHieroRule(hiero_before, hiero_after); line = hiero_after; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); alignString = temp; } if (++pipes) { StringPiece str(*pipes); //counts } bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { int processed; float score = converter.StringToFloat(s->data(), s->length(), &processed); UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count); scoreVector.push_back(FloorScore(TransformScore(score))); } const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { stringstream strme; strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } // parse source & find pt node // constituent labels Word *sourceLHS; Word *targetLHS; // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(); targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); // source Phrase sourcePhrase; sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ruleTable, sparseString); } if (++pipes) { StringPiece propertiesString(*pipes); targetPhrase->SetProperties(propertiesString); } targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector); targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply()); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; } // sort and prune each target phrase collection SortAndPrune(ruleTable); return true; }
bool HyperTreeLoader::Load(AllOptions const& opts, const std::vector<FactorType> &input, const std::vector<FactorType> &output, const std::string &inFile, const RuleTableFF &ff, HyperTree &trie, boost::unordered_set<std::size_t> &sourceTermSet) { PrintUserTime(std::string("Start loading HyperTree")); sourceTermSet.clear(); std::size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables std::vector<float> scoreVector; StringPiece line; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); HyperPathLoader hyperPathLoader; Phrase dummySourcePhrase; { Word *lhs = NULL; dummySourcePhrase.CreateFromString(Input, input, "hello", &lhs); delete lhs; } while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourceString(*pipes); StringPiece targetString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); alignString = temp; } ++pipes; // counts scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { int processed; float score = converter.StringToFloat(s->data(), s->length(), &processed); UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count); scoreVector.push_back(FloorScore(TransformScore(score))); } const std::size_t numScoreComponents = ff.GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count); } // Source-side HyperPath sourceFragment; hyperPathLoader.Load(sourceString, sourceFragment); ExtractSourceTerminalSetFromHyperPath(sourceFragment, sourceTermSet); // Target-side TargetPhrase *targetPhrase = new TargetPhrase(&ff); Word *targetLHS = NULL; targetPhrase->CreateFromString(Output, output, targetString, &targetLHS); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo(alignString); if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ff, sparseString); } if (++pipes) { StringPiece propertiesString(*pipes); targetPhrase->SetProperties(propertiesString); } targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector); targetPhrase->EvaluateInIsolation(dummySourcePhrase, ff.GetFeaturesToApply()); // Add rule to trie. TargetPhraseCollection::shared_ptr phraseColl = GetOrCreateTargetPhraseCollection(trie, sourceFragment); phraseColl->Add(targetPhrase); count++; } // sort and prune each target phrase collection if (ff.GetTableLimit()) { SortAndPrune(trie, ff.GetTableLimit()); } return true; }
bool RuleTrieLoader::Load(const std::vector<FactorType> &input, const std::vector<FactorType> &output, const std::string &inFile, const RuleTableFF &ff, RuleTrie &trie) { PrintUserTime(std::string("Start loading text phrase table. Moses format")); const StaticData &staticData = StaticData::Instance(); // const std::string &factorDelimiter = staticData.GetFactorDelimiter(); std::size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables std::vector<float> scoreVector; StringPiece line; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); alignString = temp; } if (++pipes) { StringPiece str(*pipes); //counts } bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == std::string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ff.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { int processed; float score = converter.StringToFloat(s->data(), s->length(), &processed); UTIL_THROW_IF2(std::isnan(score), "Bad score " << *s << " on line " << count); scoreVector.push_back(FloorScore(TransformScore(score))); } const std::size_t numScoreComponents = ff.GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { UTIL_THROW2("Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count); } // parse source & find pt node // constituent labels Word *sourceLHS = NULL; Word *targetLHS; // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(&ff); // targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); targetPhrase->CreateFromString(Output, output, targetPhraseString, &targetLHS); // source Phrase sourcePhrase; // sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS); sourcePhrase.CreateFromString(Input, input, sourcePhraseString, &sourceLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ff, sparseString); } if (++pipes) { StringPiece propertiesString(*pipes); targetPhrase->SetProperties(propertiesString); } targetPhrase->GetScoreBreakdown().Assign(&ff, scoreVector); targetPhrase->EvaluateInIsolation(sourcePhrase, ff.GetFeaturesToApply()); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection( trie, *sourceLHS, sourcePhrase); phraseColl.Add(targetPhrase); // not implemented correctly in memory pt. just delete it for now delete sourceLHS; count++; } // sort and prune each target phrase collection if (ff.GetTableLimit()) { SortAndPrune(trie, ff.GetTableLimit()); } return true; }
int main(int argc, char **argv) { int nscores = 5; std::string ttable = ""; bool useAlignments = false; bool reportCounts = false; for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-n")) { if(i + 1 == argc) usage(); nscores = atoi(argv[++i]); } else if(!strcmp(argv[i], "-t")) { if(i + 1 == argc) usage(); ttable = argv[++i]; } else if(!strcmp(argv[i], "-a")) { useAlignments = true; } else if (!strcmp(argv[i], "-c")) { reportCounts = true; } else usage(); } if(ttable == "") usage(); std::vector<FactorType> input(1, 0); std::vector<FactorType> output(1, 0); std::vector<float> weight(nscores, 0); LMList lmList; Parameter *parameter = new Parameter(); const_cast<std::vector<std::string>&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||"); const_cast<std::vector<std::string>&>(parameter->GetParam("input-factors")).resize(1, "0"); const_cast<std::vector<std::string>&>(parameter->GetParam("verbose")).resize(1, "0"); const_cast<std::vector<std::string>&>(parameter->GetParam("weight-w")).resize(1, "0"); const_cast<std::vector<std::string>&>(parameter->GetParam("weight-d")).resize(1, "0"); StaticData::InstanceNonConst().LoadData(parameter); SparsePhraseDictionaryFeature *spdf = NULL; PhraseDictionaryFeature pdf(Compact, spdf, nscores, nscores, input, output, ttable, weight, 0, 0, "", ""); PhraseDictionaryCompact pdc(nscores, Compact, &pdf, false, useAlignments); bool ret = pdc.Load(input, output, ttable, weight, 0, lmList, 0); assert(ret); std::string line; while(getline(std::cin, line)) { Phrase sourcePhrase; sourcePhrase.CreateFromString(input, line, "||dummy_string||"); TargetPhraseVectorPtr decodedPhraseColl = pdc.GetTargetPhraseCollectionRaw(sourcePhrase); if(decodedPhraseColl != NULL) { if(reportCounts) std::cout << sourcePhrase << decodedPhraseColl->size() << std::endl; else for(TargetPhraseVector::iterator it = decodedPhraseColl->begin(); it != decodedPhraseColl->end(); it++) { TargetPhrase &tp = *it; std::cout << sourcePhrase << "||| "; std::cout << static_cast<const Phrase&>(tp) << "|||"; if(useAlignments) std::cout << " " << tp.GetAlignTerm() << "|||"; std::vector<float> scores = tp.GetScoreBreakdown().GetScoresForProducer(&pdf); for(size_t i = 0; i < scores.size(); i++) std::cout << " " << exp(scores[i]); std::cout << std::endl; } } else if(reportCounts) std::cout << sourcePhrase << 0 << std::endl; std::cout.flush(); } }
void testCalcScore() { double p_the = -1.383059; double p_licenses = -2.360783; double p_for = -1.661813; double p_most = -2.360783; // double p_software = -1.62042; double p_the_licenses = -0.9625873; double p_licenses_for = -1.661557; double p_for_most = -0.4526253; // double p_most_software = -1.70295; double p_the_licenses_for = p_the_licenses + p_licenses_for; // double p_licenses_for_most = p_licenses_for + p_for_most; // the { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 1 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount == 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore(p_the), fullScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01); } // the licenses { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the licenses", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 2 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount == 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore(p_licenses + p_the_licenses), fullScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore( 0.0 ), ngramScore, 0.01); } // the licenses for { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the licenses for", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 3 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount == 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses_for ), ngramScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore(p_for + p_licenses_for + p_the_licenses), fullScore, 0.01); } // the licenses for most { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the licenses for most", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 4 ); float fullScore; float ngramScore; size_t oovCount; backwardLM->CalcScore(phrase, fullScore, ngramScore, oovCount); BOOST_CHECK( oovCount == 0 ); SLOPPY_CHECK_CLOSE( TransformLMScore( p_the_licenses + p_licenses_for ), ngramScore, 0.01); SLOPPY_CHECK_CLOSE( TransformLMScore(p_most + p_for_most + p_licenses_for + p_the_licenses), fullScore, 0.01); } }
void testEvaluate() { FFState *nextState; FFState *prevState = const_cast< FFState * >(backwardLM->EmptyHypothesisState( *dummyInput )); double p_most = -2.360783; double p_for = -1.661813; double p_licenses = -2.360783; double p_the = -1.383059; double p_eos = -1.457693; double p_most_for = -0.4526253; double p_for_licenses = -1.661557; double p_licenses_the = -0.9625873; double p_the_eos = -1.940311; // the { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "the", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 1 ); float score; nextState = backwardLM->Evaluate(phrase, prevState, score); // p(the) * p(</s> | the) / p(</s>) SLOPPY_CHECK_CLOSE( (p_the + p_the_eos - p_eos), score, 0.01); delete prevState; prevState = nextState; } // the licenses { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "licenses", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 1 ); float score; nextState = backwardLM->Evaluate(phrase, prevState, score); // p(licenses) * p(licenses | the) / p(the) SLOPPY_CHECK_CLOSE( (p_licenses + p_licenses_the - p_the), score, 0.01); delete prevState; prevState = nextState; } // the licenses for { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "for", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 1 ); float score; nextState = backwardLM->Evaluate(phrase, prevState, score); // p(for) * p(for | licenses) / p(licenses) SLOPPY_CHECK_CLOSE( (p_for + p_for_licenses - p_licenses), score, 0.01); delete prevState; prevState = nextState; } // the licenses for most { Phrase phrase; BOOST_CHECK( phrase.GetSize() == 0 ); std::vector<FactorType> outputFactorOrder; outputFactorOrder.push_back(0); phrase.CreateFromString( outputFactorOrder, "most", StaticData::Instance().GetFactorDelimiter()); BOOST_CHECK( phrase.GetSize() == 1 ); float score; nextState = backwardLM->Evaluate(phrase, prevState, score); // p(most) * p(most | for) / p(for) SLOPPY_CHECK_CLOSE( (p_most + p_most_for - p_for), score, 0.01); delete prevState; prevState = nextState; } delete prevState; }