bool RuleTableLoaderStandard::Load(FormatType format , const std::vector<FactorType> &input , const std::vector<FactorType> &output , const std::string &inFile , const std::vector<float> &weight , size_t /* tableLimit */ , const LMList &languageModels , const WordPenaltyProducer* wpProducer , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables vector<float> scoreVector; StringPiece line; std::string hiero_before, hiero_after; while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } if (format == HieroFormat) { // inefficiently reformat line hiero_before.assign(line.data(), line.size()); ReformatHieroRule(hiero_before, hiero_after); line = hiero_after; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString(*++pipes); // TODO(bhaddow) efficiently handle default instead of parsing this string every time. StringPiece ruleCountString = ++pipes ? *pipes : StringPiece("1 1"); if (++pipes) { stringstream strme; strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count; UserMessage::Add(strme.str()); abort(); } bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { char *err_ind; scoreVector.push_back(strtod(s->data(), &err_ind)); UTIL_THROW_IF(err_ind == s->data(), util::Exception, "Bad score " << *s << " on line " << count); } const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { stringstream strme; strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } // parse source & find pt node // constituent labels Word sourceLHS, targetLHS; // source Phrase sourcePhrase( 0); sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS); // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(Output); targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS); targetPhrase->SetSourcePhrase(sourcePhrase); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString, sourcePhrase); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetRuleCount(ruleCountString, scoreVector[0]); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); // component score, for n-best output std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; } // sort and prune each target phrase collection SortAndPrune(ruleTable); return true; }
bool RuleTableLoaderStandard::Load(FormatType format , const std::vector<FactorType> &input , const std::vector<FactorType> &output , const std::string &inFile , size_t /* tableLimit */ , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; std::ostream *progress = NULL; IFVERBOSE(1) progress = &std::cerr; util::FilePiece in(inFile.c_str(), progress); // reused variables vector<float> scoreVector; StringPiece line; std::string hiero_before, hiero_after; double_conversion::StringToDoubleConverter converter(double_conversion::StringToDoubleConverter::NO_FLAGS, NAN, NAN, "inf", "nan"); while(true) { try { line = in.ReadLine(); } catch (const util::EndOfFileException &e) { break; } if (format == HieroFormat) { // inefficiently reformat line hiero_before.assign(line.data(), line.size()); ReformatHieroRule(hiero_before, hiero_after); line = hiero_after; } util::TokenIter<util::MultiCharacter> pipes(line, "|||"); StringPiece sourcePhraseString(*pipes); StringPiece targetPhraseString(*++pipes); StringPiece scoreString(*++pipes); StringPiece alignString; if (++pipes) { StringPiece temp(*pipes); alignString = temp; } if (++pipes) { StringPiece str(*pipes); //counts } bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } scoreVector.clear(); for (util::TokenIter<util::AnyCharacter, true> s(scoreString, " \t"); s; ++s) { int processed; float score = converter.StringToFloat(s->data(), s->length(), &processed); UTIL_THROW_IF(isnan(score), util::Exception, "Bad score " << *s << " on line " << count); scoreVector.push_back(FloorScore(TransformScore(score))); } const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { stringstream strme; strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } // parse source & find pt node // constituent labels Word *sourceLHS; Word *targetLHS; // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(); targetPhrase->CreateFromString(Output, output, targetPhraseString, factorDelimiter, &targetLHS); // source Phrase sourcePhrase; sourcePhrase.CreateFromString(Input, input, sourcePhraseString, factorDelimiter, &sourceLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); if (++pipes) { StringPiece sparseString(*pipes); targetPhrase->SetSparseScore(&ruleTable, sparseString); } if (++pipes) { StringPiece propertiesString(*pipes); targetPhrase->SetProperties(propertiesString); } targetPhrase->GetScoreBreakdown().Assign(&ruleTable, scoreVector); targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply()); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; } // sort and prune each target phrase collection SortAndPrune(ruleTable); return true; }
bool RuleTableLoaderCompact::LoadRuleSection( LineReader &reader, const std::vector<Word> &vocab, const std::vector<Phrase> &sourcePhrases, const std::vector<Phrase> &targetPhrases, const std::vector<size_t> &targetLhsIds, const std::vector<const AlignmentInfo *> &alignmentSets, RuleTableTrie &ruleTable) { // Read rule count. reader.ReadLine(); const size_t ruleCount = std::atoi(reader.m_line.c_str()); // Read rules and add to table. const size_t numScoreComponents = ruleTable.GetNumScoreComponents(); std::vector<float> scoreVector(numScoreComponents); std::vector<size_t> tokenPositions; for (size_t i = 0; i < ruleCount; ++i) { reader.ReadLine(); tokenPositions.clear(); FindTokens(tokenPositions, reader.m_line); const char *charLine = reader.m_line.c_str(); // The first three tokens are IDs for the source phrase, target phrase, // and alignment set. const int sourcePhraseId = std::atoi(charLine+tokenPositions[0]); const int targetPhraseId = std::atoi(charLine+tokenPositions[1]); const int alignmentSetId = std::atoi(charLine+tokenPositions[2]); const Phrase &sourcePhrase = sourcePhrases[sourcePhraseId]; const Phrase &targetPhrasePhrase = targetPhrases[targetPhraseId]; const Word *targetLhs = new Word(vocab[targetLhsIds[targetPhraseId]]); Word sourceLHS("X"); // TODO not implemented for compact const AlignmentInfo *alignNonTerm = alignmentSets[alignmentSetId]; // Then there should be one score for each score component. for (size_t j = 0; j < numScoreComponents; ++j) { float score = std::atof(charLine+tokenPositions[3+j]); scoreVector[j] = FloorScore(TransformScore(score)); } if (reader.m_line[tokenPositions[3+numScoreComponents]] != ':') { std::stringstream msg; msg << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << reader.m_lineNum; UserMessage::Add(msg.str()); return false; } // The remaining columns are currently ignored. // Create and score target phrase. TargetPhrase *targetPhrase = new TargetPhrase(targetPhrasePhrase); targetPhrase->SetAlignNonTerm(alignNonTerm); targetPhrase->SetTargetLHS(targetLhs); targetPhrase->SetSourcePhrase(sourcePhrase); targetPhrase->Evaluate(sourcePhrase, ruleTable.GetFeaturesToApply()); // Insert rule into table. TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection( ruleTable, sourcePhrase, *targetPhrase, &sourceLHS); coll.Add(targetPhrase); } return true; }
PhraseDictionary* PhraseDictionaryFeature::LoadPhraseTable(const TranslationSystem* system) { const StaticData& staticData = StaticData::Instance(); if (m_implementation == Memory) { // memory phrase table VERBOSE(2,"using standard phrase tables" << std::endl); if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) { m_filePath += ".gz"; VERBOSE(2,"Using gzipped file" << std::endl); } if (staticData.GetInputType() != SentenceInput) { UserMessage::Add("Must use binary phrase table for this input type"); CHECK(false); } PhraseDictionaryMemory* pdm = new PhraseDictionaryMemory(m_numScoreComponent,this); bool ret = pdm->Load(GetInput(), GetOutput() , m_filePath , m_weight , m_tableLimit , system->GetLanguageModels() , system->GetWeightWordPenalty()); CHECK(ret); return pdm; } else if (m_implementation == Binary) { PhraseDictionaryTreeAdaptor* pdta = new PhraseDictionaryTreeAdaptor(m_numScoreComponent, m_numInputScores,this); bool ret = pdta->Load( GetInput() , GetOutput() , m_filePath , m_weight , m_tableLimit , system->GetLanguageModels() , system->GetWeightWordPenalty()); CHECK(ret); return pdta; } else if (m_implementation == SCFG || m_implementation == Hiero) { // memory phrase table if (m_implementation == Hiero) { VERBOSE(2,"using Hiero format phrase tables" << std::endl); } else { VERBOSE(2,"using Moses-formatted SCFG phrase tables" << std::endl); } if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) { m_filePath += ".gz"; VERBOSE(2,"Using gzipped file" << std::endl); } RuleTableTrie *dict; if (staticData.GetParsingAlgorithm() == ParseScope3) { dict = new RuleTableUTrie(m_numScoreComponent, this); } else { dict = new PhraseDictionarySCFG(m_numScoreComponent, this); } bool ret = dict->Load(GetInput() , GetOutput() , m_filePath , m_weight , m_tableLimit , system->GetLanguageModels() , system->GetWordPenaltyProducer()); assert(ret); return dict; } else if (m_implementation == ALSuffixArray) { // memory phrase table VERBOSE(2,"using Hiero format phrase tables" << std::endl); if (!FileExists(m_filePath) && FileExists(m_filePath + ".gz")) { m_filePath += ".gz"; VERBOSE(2,"Using gzipped file" << std::endl); } PhraseDictionaryALSuffixArray* pdm = new PhraseDictionaryALSuffixArray(m_numScoreComponent,this); bool ret = pdm->Load(GetInput() , GetOutput() , m_filePath , m_weight , m_tableLimit , system->GetLanguageModels() , system->GetWordPenaltyProducer()); CHECK(ret); return pdm; } else if (m_implementation == OnDisk) { PhraseDictionaryOnDisk* pdta = new PhraseDictionaryOnDisk(m_numScoreComponent, this); bool ret = pdta->Load(GetInput() , GetOutput() , m_filePath , m_weight , m_tableLimit , system->GetLanguageModels() , system->GetWordPenaltyProducer()); CHECK(ret); return pdta; } else if (m_implementation == SuffixArray) { #ifndef WIN32 PhraseDictionaryDynSuffixArray *pd = new PhraseDictionaryDynSuffixArray(m_numScoreComponent, this); if(!(pd->Load( GetInput() ,GetOutput() ,m_filePath ,m_targetFile , m_alignmentsFile , m_weight, m_tableLimit , system->GetLanguageModels() , system->GetWeightWordPenalty()))) { std::cerr << "FAILED TO LOAD\n" << endl; delete pd; pd = NULL; } std::cerr << "Suffix array phrase table loaded" << std::endl; return pd; #else CHECK(false); #endif } else if (m_implementation == FuzzyMatch) { PhraseDictionaryFuzzyMatch *dict = new PhraseDictionaryFuzzyMatch(m_numScoreComponent, this); bool ret = dict->Load(GetInput() , GetOutput() , m_filePath , m_weight , m_tableLimit , system->GetLanguageModels() , system->GetWordPenaltyProducer()); assert(ret); return dict; } else if (m_implementation == Compact) { #ifndef WIN32 VERBOSE(2,"Using compact phrase table" << std::endl); PhraseDictionaryCompact* pd = new PhraseDictionaryCompact(m_numScoreComponent, m_implementation, this); bool ret = pd->Load(GetInput(), GetOutput() , m_filePath , m_weight , m_tableLimit , system->GetLanguageModels() , system->GetWeightWordPenalty()); assert(ret); return pd; #else CHECK(false); #endif } else { std::cerr << "Unknown phrase table type " << m_implementation << endl; CHECK(false); } }
bool RuleTableLoaderStandard::Load(FormatType format , const std::vector<FactorType> &input , const std::vector<FactorType> &output , std::istream &inStream , const std::vector<float> &weight , size_t /* tableLimit */ , const LMList &languageModels , const WordPenaltyProducer* wpProducer , RuleTableTrie &ruleTable) { PrintUserTime(string("Start loading text SCFG phrase table. ") + (format==MosesFormat?"Moses ":"Hiero ") + " format"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); string lineOrig; size_t count = 0; while(getline(inStream, lineOrig)) { const string *line; if (format == HieroFormat) { // reformat line line = ReformatHieroRule(lineOrig); } else { // do nothing to format of line line = &lineOrig; } vector<string> tokens; vector<float> scoreVector; TokenizeMultiCharSeparator(tokens, *line , "|||" ); if (tokens.size() != 4 && tokens.size() != 5) { stringstream strme; strme << "Syntax error at " << ruleTable.GetFilePath() << ":" << count; UserMessage::Add(strme.str()); abort(); } const string &sourcePhraseString = tokens[0] , &targetPhraseString = tokens[1] , &scoreString = tokens[2] , &alignString = tokens[3]; bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( ruleTable.GetFilePath() << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } Tokenize<float>(scoreVector, scoreString); const size_t numScoreComponents = ruleTable.GetFeature()->GetNumScoreComponents(); if (scoreVector.size() != numScoreComponents) { stringstream strme; strme << "Size of scoreVector != number (" << scoreVector.size() << "!=" << numScoreComponents << ") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } CHECK(scoreVector.size() == numScoreComponents); // parse source & find pt node // constituent labels Word sourceLHS, targetLHS; // source Phrase sourcePhrase( 0); sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS); // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(Output); targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS); // rest of target phrase targetPhrase->SetAlignmentInfo(alignString); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); // component score, for n-best output std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weight, languageModels,wpProducer); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(ruleTable, sourcePhrase, *targetPhrase, sourceLHS); phraseColl.Add(targetPhrase); count++; if (format == HieroFormat) { // reformat line delete line; } else { // do nothing } } // sort and prune each target phrase collection SortAndPrune(ruleTable); return true; }