void PhraseTableCreator::EncodeTargetPhraseREnc(std::vector<std::string>& s, std::vector<std::string>& t, std::set<AlignPoint>& a, std::ostream& os) { std::stringstream encodedTargetPhrase; std::vector<std::vector<size_t> > a2(t.size()); for(std::set<AlignPoint>::iterator it = a.begin(); it != a.end(); it++) a2[it->second].push_back(it->first); for(size_t i = 0; i < t.size(); i++) { unsigned idxTarget = GetOrAddTargetSymbolId(t[i]); unsigned encodedSymbol = -1; unsigned bestSrcPos = s.size(); unsigned bestDiff = s.size(); unsigned bestRank = m_lexicalTable.size(); unsigned badRank = m_lexicalTable.size(); for(std::vector<size_t>::iterator it = a2[i].begin(); it != a2[i].end(); it++) { unsigned idxSource = GetSourceSymbolId(s[*it]); size_t r = GetRank(idxSource, idxTarget); if(r != badRank) { if(r < bestRank) { bestRank = r; bestSrcPos = *it; bestDiff = abs(*it-i); } else if(r == bestRank && unsigned(abs(*it-i)) < bestDiff) { bestSrcPos = *it; bestDiff = abs(*it-i); } } } if(bestRank != badRank && bestSrcPos < s.size()) { if(bestSrcPos == i) encodedSymbol = EncodeREncSymbol3(bestRank); else encodedSymbol = EncodeREncSymbol2(bestSrcPos, bestRank); a.erase(AlignPoint(bestSrcPos, i)); } else { encodedSymbol = EncodeREncSymbol1(idxTarget); } os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); m_symbolCounter.Increase(encodedSymbol); } unsigned stopSymbolId = GetTargetSymbolId(m_phraseStopSymbol); unsigned encodedSymbol = EncodeREncSymbol1(stopSymbolId); os.write((char*)&encodedSymbol, sizeof(encodedSymbol)); m_symbolCounter.Increase(encodedSymbol); }
// Decodes target phrases for `sourcePhrase` from `encodedBitStream` and
// appends them to `tpv`.
//
// The stream is consumed by a small state machine that runs once per read:
//   New       -> push a fresh TargetPhrase onto tpv
//   Symbol    -> read word / rank / subphrase symbols until the stop symbol (0)
//   Score     -> read scores until m_numScoreComponent have been collected
//   Alignment -> (only if m_containsAlignmentInfo) read alignment points until
//                the stop point (-1, -1)
//   Add       -> attach the alignment, decide whether to stop or start over
//
// tpv              - collection to extend; if it is non-empty on entry the
//                    result is not written to m_decodingCache
// encodedBitStream - bit stream positioned at the next encoded phrase
// sourcePhrase     - source phrase the collection belongs to
// topLevel         - when false and m_coding == PREnc, decoding stops as soon
//                    as m_maxRank phrases are present
//
// Returns `tpv`, or an empty TargetPhraseVectorPtr as soon as one of the
// consistency checks fails. In that case the phrases already pushed onto
// `tpv` are left in place.
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  bool extending = tpv->size();
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  // Source symbol ids are only needed to resolve rank symbols (REnc).
  std::vector<int> sourceWords;
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord
      = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd()) {

    if(state == New) {
      // Start a new TargetPhrase at the back of the collection
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();

      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    if(state == Symbol) {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);
      if(symbol == phraseStopSymbol) {
        state = Score;
      } else {
        if(m_coding == REnc) {
          // NOTE(review): a type other than 1, 2 or 3 leaves wordString empty
          // and an empty word is added; confirm GetREncType cannot return one.
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1) {
            // plain target word
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          } else if (type == 2) {
            // rank of the translation of an explicitly given source position
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            // false positive consistency check
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = targetPhrase->GetSize();
              // Built as AlignPointSizeT directly (as in the PREnc branch) so
              // the positions are not squeezed through AlignPoint first.
              alignment.insert(AlignPointSizeT(srcPos, trgPos));
            }
          } else if(type == 3) {
            // rank of the translation of the source word at the current
            // target position
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            // false positive consistency check
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = srcPos;
              alignment.insert(AlignPointSizeT(srcPos, trgPos));
            }
          }

          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        } else if(m_coding == PREnc) {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1) {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);
            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            int srcStart = left + targetPhrase->GetSize();
            int srcEnd = srcSize - right - 1;

            // false positive consistency check
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;
            // The last entry of tpv is the phrase under construction; a
            // pointer into this very collection may only name the entries
            // before it, otherwise the phrase would be appended to itself.
            size_t usableSize = tpv->size() - 1;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize) {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
              usableSize = (subTpv != NULL) ? subTpv->size() : size_t(0);
            }

            // false positive consistency check
            if(subTpv != NULL && rank < usableSize) {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo) {
                // reconstruct the alignment data based on the alignment of the subphrase
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++) {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            } else
              return TargetPhraseVectorPtr();
          }
        } else {
          // no symbol encoding: the symbol is the target word id itself
          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    } else if(state == Score) {
      // one tree per score component, or a single shared tree
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);

      if(scores.size() == m_numScoreComponent) {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    } else if(state == Alignment) {
      // The point is always read to keep the stream in sync; it is only
      // stored when alignment info is actually used.
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else {
        if(m_phraseDictionary.m_useAlignmentInfo)
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    // Deliberately a separate `if`: Score/Alignment may switch to Add and the
    // phrase is then finalised within the same loop iteration.
    if(state == Add) {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc) {
        // remember the stream position while still within the rank limit
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // 8 or fewer bits left: treated as end of data (presumably padding)
      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  // Only a collection decoded from scratch is cached, together with the
  // number of bits that were left at the last recorded position.
  if(m_coding == PREnc && !extending) {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
void PhraseTableCreator::LoadLexicalTable(std::string filePath) { std::vector<SrcTrgProb> t_lexTable; std::cerr << "Reading in lexical table for Rank Encoding" << std::endl; std::ifstream lexIn(filePath.c_str(), std::ifstream::in); std::string src, trg; float prob; // Reading in the translation probability lexicon std::cerr << "\tLoading from " << filePath << std::endl; while(lexIn >> trg >> src >> prob) { t_lexTable.push_back(SrcTrgProb(SrcTrgString(src, trg), prob)); AddSourceSymbolId(src); AddTargetSymbolId(trg); } // Sorting lexicon by source words by lexicographical order, corresponding // target words by decreasing probability. std::cerr << "\tSorting according to translation rank" << std::endl; std::sort(t_lexTable.begin(), t_lexTable.end(), SrcTrgProbSorter()); // Re-assigning source word ids in lexicographical order std::vector<std::string> temp1; temp1.resize(m_sourceSymbolsMap.size()); for(boost::unordered_map<std::string, unsigned>::iterator it = m_sourceSymbolsMap.begin(); it != m_sourceSymbolsMap.end(); it++) temp1[it->second] = it->first; std::sort(temp1.begin(), temp1.end()); for(size_t i = 0; i < temp1.size(); i++) m_sourceSymbolsMap[temp1[i]] = i; // Building the lexicon based on source and target word ids std::string srcWord = ""; size_t srcIdx = 0; for(std::vector<SrcTrgProb>::iterator it = t_lexTable.begin(); it != t_lexTable.end(); it++) { // If we encounter a new source word if(it->first.first != srcWord) { srcIdx = GetSourceSymbolId(it->first.first); // Store position of first translation if(srcIdx >= m_lexicalTableIndex.size()) m_lexicalTableIndex.resize(srcIdx + 1); m_lexicalTableIndex[srcIdx] = m_lexicalTable.size(); } // Store pair of source word and target word size_t trgIdx = GetTargetSymbolId(it->first.second); m_lexicalTable.push_back(SrcTrg(srcIdx, trgIdx)); srcWord = it->first.first; } std::cerr << "\tLoaded " << m_lexicalTable.size() << " lexical pairs" << std::endl; std::cerr << std::endl; }