void PhraseDictionaryMemory::AddEquivPhrase(const Phrase &source, const TargetPhrase &targetPhrase) { TargetPhraseCollection &phraseColl = *CreateTargetPhraseCollection(source); phraseColl.Add(new TargetPhrase(targetPhrase)); }
/**
 * Decodes target phrases for sourcePhrase from encodedBitStream and appends
 * them to tpv.
 *
 * The main loop is a small state machine that runs once per target phrase:
 *   New       - append an empty TargetPhrase to tpv and reset per-phrase state
 *   Symbol    - read symbols from m_symbolTree and add words / sub-phrases
 *               until the stop symbol (0) is read
 *   Score     - read scores until m_numScoreComponent values are collected
 *   Alignment - (only if m_containsAlignmentInfo) read alignment points until
 *               the stop point (-1, -1) is read
 *   Add       - finalize the phrase and decide whether to continue
 * Each loop iteration reads at most one item from the stream; New falls
 * through into Symbol, and Score/Alignment fall through into Add as soon as
 * they switch the state.
 *
 * @param tpv              vector that receives the decoded target phrases; if
 *                         it is non-empty on entry the result is not cached
 * @param encodedBitStream stream the symbol, score and alignment trees read from
 * @param sourcePhrase     source phrase; set on every decoded target phrase
 * @param topLevel         when false under PREnc coding with a non-zero
 *                         m_maxRank, decoding stops once tpv holds at least
 *                         m_maxRank entries
 * @return tpv on success; a default-constructed TargetPhraseVectorPtr when one
 *         of the consistency checks fails. In that case tpv has already been
 *         modified (a partially decoded phrase may have been pushed).
 */
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection( TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream, const Phrase &sourcePhrase, bool topLevel)
{
  // True when the caller passed in a vector that already holds phrases.
  bool extending = tpv->size();
  // Value later handed to the decoding cache (PREnc only); updated in the Add state.
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  // Source symbol ids; only filled (and only used) for REnc coding.
  std::vector<int> sourceWords;
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  // Sentinels that terminate the symbol sequence and the alignment sequence.
  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  // Per-phrase accumulators, cleared whenever a new phrase is started.
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  // Points at the element of *tpv currently being filled.
  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd()) {

    if(state == New) {
      // Append a new TargetPhrase to tpv; targetPhrase points at that vector element
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();

      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    if(state == Symbol) {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);
      if(symbol == phraseStopSymbol) {
        state = Score;
      } else {
        if(m_coding == REnc) {
          // REnc: the symbol's type selects how the target word is recovered.
          // Any type other than 1, 2 or 3 leaves wordString empty.
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1) {
            // type 1: symbol decodes directly to a target symbol id
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          } else if (type == 2) {
            // type 2: symbol carries a rank and an explicit source position
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            // reject positions outside the source phrase
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              // the word about to be added lands at the current phrase length
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          } else if(type == 3) {
            // type 3: symbol carries only a rank; the source position is the
            // current target position
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            // reject positions outside the source phrase
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }

          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        } else if(m_coding == PREnc) {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1) {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);

            Word word;
            word.CreateFromString(Output, *m_output, GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            // source span of the sub-phrase: start is offset from the current
            // target length, end is counted back from the end of the source
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd = srcSize - right - 1;

            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize) {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }

            // false positive consistency check
            if(subTpv != NULL && rank < subTpv->size()) {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo) {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++) {
                  // shift sub-phrase alignment points by the sub-phrase's
                  // source start and the current target length
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            } else
              return TargetPhraseVectorPtr();
          }
        } else {
          // any other coding: the symbol is used as the target symbol id directly
          Word word;
          word.CreateFromString(Output, *m_output, GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    } else if(state == Score) {
      // With multiple score trees, the i-th score is read from the i-th tree;
      // otherwise every score comes from tree 0.
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);

      if(scores.size() == m_numScoreComponent) {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    } else if(state == Alignment) {
      // Alignment points are always read from the stream; they are only
      // stored when m_useAlignmentInfo is set.
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else {
        if(m_phraseDictionary.m_useAlignmentInfo)
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    if(state == Add) {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc) {
        // remember the stream position while tpv is still within m_maxRank
        // (or always, when m_maxRank is 0)
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        // non-top-level calls stop once m_maxRank phrases are available
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // stop when TellFromEnd() reports 8 or less
      // NOTE(review): presumably trailing padding bits of the last byte - confirm
      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  // Cache only results that were decoded from scratch under PREnc coding.
  if(m_coding == PREnc && !extending) {
    // values of 8 or less are stored as 0
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
bool PhraseDictionaryMemory::Load(const std::vector<FactorType> &input , const std::vector<FactorType> &output , const string &filePath , const vector<float> &weight , size_t tableLimit , const LMList &languageModels , float weightWP) { const_cast<LMList&>(languageModels).InitializeBeforeSentenceProcessing(); const StaticData &staticData = StaticData::Instance(); m_tableLimit = tableLimit; util::FilePiece inFile(filePath.c_str(), staticData.GetVerboseLevel() >= 1 ? &std::cerr : NULL); size_t line_num = 0; size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info const std::string& factorDelimiter = staticData.GetFactorDelimiter(); Phrase sourcePhrase(0); std::vector<float> scv; scv.reserve(m_numScoreComponent); TargetPhraseCollection *preSourceNode = NULL; std::string preSourceString; while(true) { ++line_num; StringPiece line; try { line = inFile.ReadLine(); } catch (util::EndOfFileException &e) { break; } util::TokenIter<util::MultiCharacter> pipes(line, util::MultiCharacter("|||")); StringPiece sourcePhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece targetPhraseString(GrabOrDie(pipes, filePath, line_num)); StringPiece scoreString(GrabOrDie(pipes, filePath, line_num)); bool isLHSEmpty = !util::TokenIter<util::AnyCharacter, true>(sourcePhraseString, util::AnyCharacter(" \t")); if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) { TRACE_ERR( filePath << ":" << line_num << ": pt entry contains empty source, skipping\n"); continue; } //target std::auto_ptr<TargetPhrase> targetPhrase(new TargetPhrase()); targetPhrase->CreateFromString(output, targetPhraseString, factorDelimiter); scv.clear(); for (util::TokenIter<util::AnyCharacter, true> token(scoreString, util::AnyCharacter(" \t")); token; ++token) { char *err_ind; // Token is always delimited by some form of space. Also, apparently strtod is portable but strtof isn't. 
scv.push_back(FloorScore(TransformScore(static_cast<float>(strtod(token->data(), &err_ind))))); if (err_ind == token->data()) { stringstream strme; strme << "Bad number " << token << " on line " << line_num; UserMessage::Add(strme.str()); abort(); } } if (scv.size() != m_numScoreComponent) { stringstream strme; strme << "Size of scoreVector != number (" <<scv.size() << "!=" <<m_numScoreComponent<<") of score components on line " << line_num; UserMessage::Add(strme.str()); abort(); } size_t consumed = 3; if (pipes) { targetPhrase->SetAlignmentInfo(*pipes++); ++consumed; } ScoreComponentCollection sparse; if (pipes) pipes++; //counts if (pipes) { //sparse features SparsePhraseDictionaryFeature* spdf = GetFeature()->GetSparsePhraseDictionaryFeature(); if (spdf) { sparse.Assign(spdf,(pipes++)->as_string()); } } // scv good to go sir! targetPhrase->SetScore(m_feature, scv, sparse, weight, weightWP, languageModels); // Check number of entries delimited by ||| agrees across all lines. for (; pipes; ++pipes, ++consumed) {} if (numElement != consumed) { if (numElement == NOT_FOUND) { numElement = consumed; } else { stringstream strme; strme << "Syntax error at " << filePath << ":" << line_num; UserMessage::Add(strme.str()); abort(); } } //TODO: Would be better to reuse source phrases, but ownership has to be //consistent across phrase table implementations sourcePhrase.Clear(); sourcePhrase.CreateFromString(input, sourcePhraseString, factorDelimiter); //Now that the source phrase is ready, we give the target phrase a copy targetPhrase->SetSourcePhrase(sourcePhrase); if (preSourceString == sourcePhraseString && preSourceNode) { preSourceNode->Add(targetPhrase.release()); } else { preSourceNode = CreateTargetPhraseCollection(sourcePhrase); preSourceNode->Add(targetPhrase.release()); preSourceString.assign(sourcePhraseString.data(), sourcePhraseString.size()); } } // sort each target phrase collection m_collection.Sort(m_tableLimit); /* // TODO ASK OLIVER WHY THIS IS NEEDED 
const_cast<LMList&>(languageModels).CleanUpAfterSentenceProcessing(); */ return true; }