/**
 * Decode target phrases from encodedBitStream and append them to tpv.
 *
 * The loop is a small state machine that runs once per target phrase:
 *   New -> Symbol -> Score -> (Alignment, if m_containsAlignmentInfo) -> Add
 * Decoding stops when the stream reports nothing left, when 8 or fewer bits
 * remain after a completed phrase, or (PREnc, non-top-level only) once tpv
 * holds at least m_maxRank phrases.
 *
 * @param tpv              collection the decoded phrases are appended to;
 *                         it may already contain phrases on entry
 * @param encodedBitStream bit stream the symbols, scores and alignment
 *                         points are read from
 * @param sourcePhrase     source phrase the decoded targets belong to
 * @param topLevel         when false (and PREnc with a non-zero m_maxRank),
 *                         decoding stops at m_maxRank phrases
 * @return tpv on success; an empty TargetPhraseVectorPtr as soon as one of
 *         the consistency checks fails. Note that tpv has usually been
 *         modified (push_back) by the time such a failure is detected.
 */
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{

  // True when tpv was non-empty on entry; in that case the result is not
  // (re-)cached at the end of this function.
  bool extending = tpv->size();
  // Stream position handed to the decoding cache (PREnc only, see below).
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  // REnc symbols refer to source words by position, so the source phrase is
  // converted to source symbol ids up front.
  std::vector<int> sourceWords;
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord
        = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  // Sentinels: symbol 0 ends the symbol sequence of a phrase, the point
  // (-1, -1) ends its alignment sequence.
  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  // Per-phrase accumulators, reset in state New.
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd()) {

    if(state == New) {
      // Append a fresh TargetPhrase to tpv and keep a pointer to that
      // element; it is the phrase all following states fill in.
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();

      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();

      state = Symbol;
    }

    // Deliberately a separate 'if': a phrase started in state New reads its
    // first symbol in the same loop iteration.
    if(state == Symbol) {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);
      if(symbol == phraseStopSymbol) {
        state = Score;
      } else {
        if(m_coding == REnc) {
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1) {
            // type 1: the symbol encodes a target symbol directly
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          } else if (type == 2) {
            // type 2: the symbol encodes a rank and an explicit source
            // position; the word is looked up via GetTranslation()
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            // position outside the source phrase: give up on this stream
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          } else if(type == 3) {
            // type 3: only a rank is encoded; the source position is taken
            // to be the current target length (same-position alignment)
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            // position outside the source phrase: give up on this stream
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }

          // NOTE(review): for any other type value wordString stays empty
          // and an empty-string word is still added - confirm that
          // GetREncType() only ever yields 1, 2 or 3.
          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        } else if(m_coding == PREnc) {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1) {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);

            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            // Source span covered by the subphrase: 'left' is relative to
            // the current target length, 'right' is counted from the end of
            // the source phrase.
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd = srcSize - right - 1;

            // false positive consistency check: the span must be non-empty
            // and lie inside the source phrase
            // (presumably "false positive" refers to a lookup hit for a
            // phrase that is not really stored - TODO confirm)
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check: rank must not exceed the
            // configured maximum (a maximum of 0 disables the check)
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize) {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }

            // false positive consistency check: the referenced collection
            // must exist and contain an entry at 'rank'
            // NOTE(review): when subTpv is tpv itself, the bound
            // subTpv->size() also counts the phrase currently being built,
            // so rank == size()-1 would append the phrase to itself -
            // confirm the encoder never emits that.
            if(subTpv != NULL && rank < subTpv->size()) {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo) {
                // reconstruct the alignment data based on the alignment of the subphrase
                // (shifted by the subphrase's source start and by the
                // current target length)
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++) {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            } else
              return TargetPhraseVectorPtr();
          }
        } else {
          // any other coding: the symbol is a target symbol id as-is
          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    } else if(state == Score) {
      // One score is read per iteration; with multiple score trees the
      // tree is selected by the index of the score being read.
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);

      // all score components read: attach them to the phrase
      if(scores.size() == m_numScoreComponent) {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    } else if(state == Alignment) {
      // Alignment points are always consumed from the stream, but only
      // stored when the dictionary is configured to use them.
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else {
        if(m_phraseDictionary.m_useAlignmentInfo)
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    // Separate 'if' so that a phrase is finalised in the same iteration
    // that switched the state to Add.
    if(state == Add) {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc) {
        // Remember the stream position after this phrase as long as the
        // collection is within m_maxRank (or no maximum is set).
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        // Nested (non-top-level) decoding stops at m_maxRank phrases.
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // 8 or fewer bits left cannot hold another phrase
      // (presumably byte padding - TODO confirm)
      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  // PREnc: cache the freshly decoded collection together with the number of
  // undecoded bits (0 when only 8 or fewer remain). Collections that were
  // extended rather than decoded from scratch are not cached here.
  if(m_coding == PREnc && !extending) {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
/**
 * Parse one phrase-table line into sourcePhrase / targetPhrase.
 *
 * The line is split in place on single spaces with strtok(), so the buffer
 * behind 'line' is modified. A "|||" token advances to the next field:
 *   0 = source phrase, 1 = target phrase, 2 = scores, 3 = alignment,
 *   4 = counts (the first two count tokens are skipped, the third is
 *       written to misc[0]).
 *
 * @param sourcePhrase  receives the source words
 * @param targetPhrase  receives target words, scores and alignment
 * @param line          writable, NUL-terminated input line
 * @param onDiskWrapper passed through to the per-word Tokenize overloads
 * @param numScores     number of scores the line is expected to carry
 *                      (checked with assert)
 * @param misc          misc[0] receives the third count value; must hold
 *                      at least one element
 * @return phrase made of the words the source-side Tokenize call returned
 *         as non-NULL
 */
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
  size_t scoresSeen = 0;
  size_t field = 0;

  char *token = strtok(line, " ");
  OnDiskPt::PhrasePtr result(new Phrase());

  for (; token != NULL; token = strtok(NULL, " ")) {
    // field separator: move on to the next section of the line
    if (strcmp(token, "|||") == 0) {
      ++field;
      continue;
    }

    switch (field) {
    case 0: {
      // source phrase word
      WordPtr sourceWord = Tokenize(sourcePhrase, token, true, true, onDiskWrapper, 1);
      if (sourceWord != NULL)
        result->AddWord(sourceWord);
      break;
    }
    case 1:
      // target phrase word
      Tokenize(targetPhrase, token, false, true, onDiskWrapper, 0);
      break;
    case 2: {
      // next score value
      const float value = Moses::Scan<float>(token);
      targetPhrase.SetScore(value, scoresSeen);
      ++scoresSeen;
      break;
    }
    case 3:
      // alignment point
      targetPhrase.CreateAlignFromString(token);
      break;
    case 4:
    case 5:
      // first and second count values are not stored; each one just moves
      // the parser one step closer to the third count
      ++field;
      break;
    case 6: {
      // third count value (rule count) is the one that is kept
      const float ruleCount = Moses::Scan<float>(token);
      misc[0] = ruleCount;
      ++field;
      break;
    }
    default:
      cerr << "ERROR in line " << line << endl;
      assert(false);
      break;
    }
  }

  assert(scoresSeen == numScores);
  targetPhrase.SortAlign();
  return result;
} // Tokenize()
void Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, char *line, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc) { size_t scoreInd = 0; // MAIN LOOP size_t stage = 0; /* 0 = source phrase 1 = target phrase 2 = scores 3 = align 4 = count */ char *tok = strtok (line," "); while (tok != NULL) { if (0 == strcmp(tok, "|||")) { ++stage; } else { switch (stage) { case 0: { Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); break; } case 1: { Tokenize(targetPhrase, tok, false, true, onDiskWrapper); break; } case 2: { float score = Moses::Scan<float>(tok); targetPhrase.SetScore(score, scoreInd); ++scoreInd; break; } case 3: { targetPhrase.Create1AlignFromString(tok); break; } case 4: ++stage; break; case 5: { // count info. Only store the 2nd one float val = Moses::Scan<float>(tok); misc[0] = val; ++stage; break; } default: assert(false); break; } } tok = strtok (NULL, " "); } // while (tok != NULL) assert(scoreInd == numScores); targetPhrase.SortAlign(); } // Tokenize()
/**
 * Parse one phrase-table line into sourcePhrase / targetPhrase.
 *
 * The line is copied into a private buffer and split on single spaces with
 * strtok(); lineStr itself is left untouched. A "|||" token advances to the
 * next field:
 *   0 = source phrase, 1 = target phrase, 2 = scores, 3 = alignment,
 *   4 = counts, 5 = sparse features, 6 = properties.
 *
 * @param sourcePhrase  receives the source words
 * @param targetPhrase  receives target words, scores, alignment, sparse
 *                      features and properties
 * @param lineStr       the input line
 * @param onDiskWrapper passed through to the per-word Tokenize overloads
 * @param numScores     number of scores the line is expected to carry
 *                      (checked with assert)
 * @param misc          misc[0] receives the count value; must hold at
 *                      least one element
 * @return phrase made of the words the source-side Tokenize call returned
 *         as non-NULL
 */
OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhrase, const std::string &lineStr, OnDiskWrapper &onDiskWrapper, int numScores, vector<float> &misc)
{
  // strtok() writes NUL bytes into its input, so work on a private,
  // NUL-terminated copy. A heap-backed vector is used instead of a
  // variable-length array: VLAs are not ISO C++ and would put a buffer of
  // input-dependent size on the stack.
  std::vector<char> lineBuffer(lineStr.begin(), lineStr.end());
  lineBuffer.push_back('\0');
  char *line = &lineBuffer[0];

  stringstream sparseFeatures, property;

  size_t scoreInd = 0;

  // MAIN LOOP
  size_t stage = 0;
  /* 0 = source phrase
     1 = target phrase
     2 = scores
     3 = align
     4 = count
     5 = sparse features
     6 = properties
  */
  char *tok = strtok (line," ");
  OnDiskPt::PhrasePtr out(new Phrase());
  while (tok != NULL) {
    if (0 == strcmp(tok, "|||")) {
      ++stage;
    } else {
      switch (stage) {
      case 0: {
        WordPtr w = Tokenize(sourcePhrase, tok, true, true, onDiskWrapper, 1);
        if (w != NULL)
          out->AddWord(w);

        break;
      }
      case 1: {
        Tokenize(targetPhrase, tok, false, true, onDiskWrapper, 0);
        break;
      }
      case 2: {
        float score = Moses::Scan<float>(tok);
        targetPhrase.SetScore(score, scoreInd);
        ++scoreInd;
        break;
      }
      case 3: {
        //targetPhrase.Create1AlignFromString(tok);
        targetPhrase.CreateAlignFromString(tok);
        break;
      }
      case 4: {
        // Every count token overwrites misc[0], so the last count on the
        // line (the rule count when three counts are present) is the one
        // that is kept.
        float val = Moses::Scan<float>(tok);
        misc[0] = val;
        break;
      }
      case 5: {
        // sparse features: re-join the tokens with single spaces
        sparseFeatures << tok << " ";
        break;
      }
      case 6: {
        // properties: re-join the tokens with single spaces
        property << tok << " ";
        break;
      }
      default:
        // Report the untouched input: the working copy has already been
        // cut into tokens by strtok() and would only show the first one.
        cerr << "ERROR in line " << lineStr << endl;
        assert(false);
        break;
      }
    }

    tok = strtok (NULL, " ");
  } // while (tok != NULL)

  assert(scoreInd == numScores);
  targetPhrase.SetSparseFeatures(Moses::Trim(sparseFeatures.str()));
  targetPhrase.SetProperty(Moses::Trim(property.str()));
  targetPhrase.SortAlign();
  return out;
} // Tokenize()
/**
 * Build the interpolated target phrase collection for a source phrase.
 *
 * Every underlying dictionary is queried for src. For each distinct target
 * phrase found, every score component k except the last is combined as
 *   log( sum over tables j of  m_weights[k][j] * exp(score_j[k]) )
 * where tables that do not contain the phrase contribute nothing. The last
 * component is set to 1 (it is assumed to be the phrase penalty).
 *
 * The result is stored in m_targetPhrases, replacing (and deleting) the
 * collection built by the previous call, so a returned pointer is only
 * valid until the next call on this object.
 *
 * @param src source phrase to look up
 * @return the combined collection, pruned to m_tableLimit entries
 */
const TargetPhraseCollection*
PhraseDictionaryInterpolated::GetTargetPhraseCollection(const Phrase& src) const
{
  // Drop the collection handed out by the previous call.
  delete m_targetPhrases;
  m_targetPhrases = new TargetPhraseCollection();

  // Gather the phrases of all tables: once as a union, once per table so
  // that the per-table scores can be found again below.
  PhraseSet allPhrases;
  vector<PhraseSet> phrasesByTable(m_dictionaries.size());
  for (size_t i = 0; i < m_dictionaries.size(); ++i) {
    const TargetPhraseCollection* phrases =
      m_dictionaries[i]->GetTargetPhraseCollection(src);
    if (phrases) {
      for (TargetPhraseCollection::const_iterator j = phrases->begin();
           j != phrases->end(); ++j) {
        allPhrases.insert(*j);
        phrasesByTable[i].insert(*j);
      }
    }
  }

  ScoreComponentCollection sparseVector;
  for (PhraseSet::const_iterator i = allPhrases.begin(); i != allPhrases.end(); ++i) {
    // Start from the bare word sequence and copy over source phrase and
    // alignments; the scores are recomputed below.
    TargetPhrase* combinedPhrase = new TargetPhrase((Phrase)**i);
    combinedPhrase->SetSourcePhrase((*i)->GetSourcePhrase());
    combinedPhrase->SetAlignTerm(&((*i)->GetAlignTerm()));
    // The non-terminal alignment must come from the non-terminal getter;
    // feeding it the terminal alignment was a copy-paste slip.
    combinedPhrase->SetAlignNonTerm(&((*i)->GetAlignNonTerm()));

    Scores combinedScores(GetFeature()->GetNumScoreComponents());
    for (size_t j = 0; j < phrasesByTable.size(); ++j) {
      PhraseSet::const_iterator tablePhrase = phrasesByTable[j].find(combinedPhrase);
      if (tablePhrase != phrasesByTable[j].end()) {
        Scores tableScores = (*tablePhrase)->GetScoreBreakdown()
                             .GetScoresForProducer(GetFeature());
        // Interpolate in probability space. The last component is skipped;
        // "k + 1 < size()" (rather than "k < size() - 1") keeps the bound
        // from wrapping around when the score vector is empty.
        for (size_t k = 0; k + 1 < tableScores.size(); ++k) {
          combinedScores[k] += m_weights[k][j] * exp(tableScores[k]);
        }
      }
    }

    // map back to log space
    for (size_t k = 0; k + 1 < combinedScores.size(); ++k) {
      combinedScores[k] = log(combinedScores[k]);
    }
    if (!combinedScores.empty())
      combinedScores.back() = 1; //assume last is penalty

    combinedPhrase->SetScore(
      GetFeature(),
      combinedScores,
      sparseVector,
      m_weightT,
      m_weightWP,
      *m_languageModels);
    m_targetPhrases->Add(combinedPhrase);
  }

  m_targetPhrases->Prune(true,m_tableLimit);

  return m_targetPhrases;
}