bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const string &filePath , const vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { const StaticData &staticData = StaticData::Instance(); m_tableLimit = tableLimit; // data from file InputFileStream inFile(filePath); // create hash file if necessary ofstream tempFile; string tempFilePath; vector< vector<string> > phraseVector; string line, prevSourcePhrase = ""; size_t count = 0; size_t line_num = 0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info while(getline(inFile, line)) { ++line_num; vector<string> tokens = TokenizeMultiCharSeparator( line , "|||" ); if (numElement == NOT_FOUND) { // init numElement numElement = tokens.size(); assert(numElement >= 3); // extended style: source ||| target ||| scores ||| [alignment] ||| [counts] } if (tokens.size() != numElement) { stringstream strme; strme << "Syntax error at " << filePath << ":" << line_num; UserMessage::Add(strme.str()); abort(); } const string &sourcePhraseString=tokens[0] ,&targetPhraseString=tokens[1] ,&scoreString = tokens[2]; bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty target, skipping\n"); continue; } const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); if (sourcePhraseString != prevSourcePhrase) phraseVector = Phrase::Parse(sourcePhraseString, input, factorDelimiter); vector<float> scoreVector = Tokenize<float>(scoreString); if (scoreVector.size() != m_numScoreComponent) { stringstream strme; strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num; UserMessage::Add(strme.str()); abort(); } // source Phrase sourcePhrase(Input); sourcePhrase.CreateFromString( input, phraseVector); //target TargetPhrase targetPhrase(Output); targetPhrase.SetSourcePhrase(&sourcePhrase); targetPhrase.CreateFromString( output, targetPhraseString, factorDelimiter); if (tokens.size() > 3) targetPhrase.SetAlignmentInfo(tokens[3]); // component score, for n-best output std::vector<float> scv(scoreVector.size()); std::transform(scoreVector.begin(),scoreVector.end(),scv.begin(),TransformScore); std::transform(scv.begin(),scv.end(),scv.begin(),FloorScore); targetPhrase.SetScore(m_feature, scv, weight, weightWP, languageModels); AddEquivPhrase(sourcePhrase, targetPhrase); count++; } // sort each target phrase collection m_collection.Sort(m_tableLimit); return true; }
bool PhraseDictionaryNewFormat::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , std::istream &inStream , const std::vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { PrintUserTime("Start loading new format pt model"); const StaticData &staticData = StaticData::Instance(); const std::string& factorDelimiter = staticData.GetFactorDelimiter(); VERBOSE(2,"PhraseDictionaryNewFormat: input=" << m_inputFactors << " output=" << m_outputFactors << std::endl); string line; size_t count = 0; while(getline(inStream, line)) { vector<string> tokens; vector<float> scoreVector; TokenizeMultiCharSeparator(tokens, line , "|||" ); if (tokens.size() != 4 && tokens.size() != 5) { stringstream strme; strme << "Syntax error at " << m_filePath << ":" << count; UserMessage::Add(strme.str()); abort(); } const string &sourcePhraseString = tokens[0] , &targetPhraseString = tokens[1] , &alignString = tokens[2] , &scoreString = tokens[3]; bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( m_filePath << ":" << count << ": pt entry contains empty target, skipping\n"); continue; } Tokenize<float>(scoreVector, scoreString); if (scoreVector.size() != m_numScoreComponent) { stringstream strme; strme << "Size of scoreVector != number (" <<scoreVector.size() << "!=" <<m_numScoreComponent<<") of score components on line " << count; UserMessage::Add(strme.str()); abort(); } assert(scoreVector.size() == m_numScoreComponent); // parse source & find pt node // head word Word sourceLHS, targetLHS; // source Phrase sourcePhrase(Input); sourcePhrase.CreateFromStringNewFormat(Input, input, sourcePhraseString, factorDelimiter, sourceLHS); // create target phrase obj TargetPhrase *targetPhrase = new TargetPhrase(Output); targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS); // alignment list<pair<size_t,size_t> > alignmentInfo; CreateAlignmentInfo(alignmentInfo, alignString); // rest of target phrase targetPhrase->SetAlignmentInfo(alignmentInfo); targetPhrase->SetTargetLHS(targetLHS); //targetPhrase->SetDebugOutput(string("New Format pt ") + line); // component score, for n-best output std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),TransformScore); std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); targetPhrase->SetScoreChart(GetFeature(), scoreVector, weight, languageModels); // count info for backoff if (tokens.size() >= 6) targetPhrase->CreateCountInfo(tokens[5]); TargetPhraseCollection &phraseColl = GetOrCreateTargetPhraseCollection(sourcePhrase, *targetPhrase); AddEquivPhrase(phraseColl, targetPhrase); count++; } // cleanup cache // sort each target phrase collection m_collection.Sort(m_tableLimit); return true; }