std::vector<TargetPhrase*> PhraseDictionaryTransliteration::CreateTargetPhrases(const Phrase &sourcePhrase, const string &outDir) const { std::vector<TargetPhrase*> ret; string outPath = outDir + "/out.txt"; ifstream outStream(outPath.c_str()); string line; while (getline(outStream, line)) { vector<string> toks; Tokenize(toks, line, "\t"); UTIL_THROW_IF2(toks.size() != 2, "Error in transliteration output file. Expecting word\tscore"); TargetPhrase *tp = new TargetPhrase(); Word &word = tp->AddWord(); word.CreateFromString(Output, m_output, toks[0], false); float score = Scan<float>(toks[1]); tp->GetScoreBreakdown().PlusEquals(this, score); // score of all other ff when this rule is being loaded tp->Evaluate(sourcePhrase, GetFeaturesToApply()); ret.push_back(tp); } outStream.close(); return ret; }
// Turn a suffix-array phrase (a sequence of vocabulary ids) into a freshly
// allocated output-side TargetPhrase by looking each id up in m_vocab.
// In builds with assertions enabled, an id that resolves to the vocabulary's
// OOV word aborts. No scores are set here.
TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase) const
{
  TargetPhrase* result = new TargetPhrase(Output);

  const size_t numWords = phrase.words.size();
  for (size_t pos = 0; pos != numWords; ++pos) {
    // map the stored id back to its target word
    Word& word = m_vocab->GetWord(phrase.words[pos]);
    assert(word != m_vocab->GetkOOVWord());
    result->AddWord(word);
  }

  return result;
}
// Turn a suffix-array phrase (a sequence of vocabulary ids) into a freshly
// allocated TargetPhrase by looking each id up in m_trgVocab, then attach
// sourcePhrase to it. CHECK fails if an id resolves to the vocabulary's OOV
// word. No scores are set here.
TargetPhrase* BilingualDynSuffixArray::
GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
{
  TargetPhrase* result = new TargetPhrase();

  const size_t numWords = phrase.words.size();
  for (size_t pos = 0; pos != numWords; ++pos) {
    // map the stored id back to its target word
    Word& word = m_trgVocab->GetWord(phrase.words[pos]);
    CHECK(word != m_trgVocab->GetkOOVWord());
    result->AddWord(word);
  }

  result->SetSourcePhrase(sourcePhrase);
  return result;
}
// Turn a suffix-array phrase (a sequence of vocabulary ids) into a freshly
// allocated TargetPhrase by looking each id up in m_trgVocab.
//
// phrase        target-side ids to convert
// sourcePhrase  not used by this implementation
//
// Returns a new TargetPhrase that nothing in this function retains or frees,
// so the caller is responsible for deleting it. No scores are set here.
//
// Throws (via UTIL_THROW_IF2) when an id resolves to the vocabulary's OOV
// word. The partially built phrase is deleted before the exception leaves
// this function, so the failure path does not leak.
TargetPhrase* BilingualDynSuffixArray::
GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const
{
  TargetPhrase* targetPhrase = new TargetPhrase();

  try {
    for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words
      Word& word = m_trgVocab->GetWord( phrase.words[i]);
      UTIL_THROW_IF2(word == m_trgVocab->GetkOOVWord(),
                     "Unknown word at position " << i);
      targetPhrase->AddWord(word);
    }
  } catch (...) {
    // we still own the phrase here; free it and pass the error on unchanged
    delete targetPhrase;
    throw;
  }

  return targetPhrase;
}
// Produce a one-word target phrase for sourcePhrase: the string of factor 0 of
// the first source word, prefixed with "SkeletonPT:".
// Requires a non-empty source phrase and exactly one output factor.
// Every score component of this phrase table is set to 1.3, after which the
// phrase is evaluated with the features returned by GetFeaturesToApply().
// Returns a newly allocated TargetPhrase.
TargetPhrase *SkeletonPT::CreateTargetPhrase(const Phrase &sourcePhrase) const
{
  CHECK(sourcePhrase.GetSize());
  CHECK(m_output.size() == 1);

  // target string = fixed prefix + surface string of the first source word
  const string targetStr =
    "SkeletonPT:" + sourcePhrase.GetWord(0).GetFactor(0)->GetString().as_string();

  TargetPhrase *targetPhrase = new TargetPhrase();
  targetPhrase->AddWord().CreateFromString(Output, m_output, targetStr, false);

  // this phrase table's own scores
  vector<float> ptScores(m_numScoreComponents, 1.3);
  targetPhrase->GetScoreBreakdown().PlusEquals(this, ptScores);

  // scores contributed by the other feature functions at load time
  targetPhrase->Evaluate(sourcePhrase, GetFeaturesToApply());

  return targetPhrase;
}
// Decode target phrases from encodedBitStream and append them to tpv.
//
// The body is a state machine (New -> Symbol -> Score -> [Alignment] -> Add)
// that keeps running while encodedBitStream.TellFromEnd() is non-zero:
//   New        append an empty TargetPhrase to tpv and reset per-phrase state
//   Symbol     read one symbol and turn it into word(s); a stop symbol ends it
//   Score      read m_numScoreComponent scores, then set them on the phrase
//   Alignment  read alignment points until the alignment stop symbol
//   Add        finalize the phrase and decide whether to continue
//
// tpv               collection receiving the decoded phrases; may be non-empty
// encodedBitStream  bit stream to read from
// sourcePhrase      source side; used for REnc/PREnc decoding and set on
//                   every decoded phrase
// topLevel          when false (and m_maxRank is non-zero), PREnc decoding
//                   stops once tpv holds at least m_maxRank phrases
//
// Returns tpv, or an empty TargetPhraseVectorPtr() as soon as one of the
// consistency checks fails.
// NOTE(review): on such an early return tpv has already been appended to;
// whether callers can observe that depends on TargetPhraseVectorPtr's
// sharing semantics - confirm.
TargetPhraseVectorPtr PhraseDecoder::DecodeCollection(
  TargetPhraseVectorPtr tpv, BitWrapper<> &encodedBitStream,
  const Phrase &sourcePhrase, bool topLevel)
{
  // true when tpv already held phrases on entry; suppresses caching below
  bool extending = tpv->size();
  // stored in the decoding cache at the end (PREnc only)
  size_t bitsLeft = encodedBitStream.TellFromEnd();

  typedef std::pair<size_t, size_t> AlignPointSizeT;

  // REnc symbols refer to source words by position, so resolve the id of
  // every source word once up front
  std::vector<int> sourceWords;
  if(m_coding == REnc) {
    for(size_t i = 0; i < sourcePhrase.GetSize(); i++) {
      std::string sourceWord
      = sourcePhrase.GetWord(i).GetString(*m_input, false);
      unsigned idx = GetSourceSymbolId(sourceWord);
      sourceWords.push_back(idx);
    }
  }

  // sentinels that end the symbol sequence / the alignment sequence of a phrase
  unsigned phraseStopSymbol = 0;
  AlignPoint alignStopSymbol(-1, -1);

  // per-phrase accumulators, cleared in state New
  std::vector<float> scores;
  std::set<AlignPointSizeT> alignment;

  enum DecodeState { New, Symbol, Score, Alignment, Add } state = New;

  size_t srcSize = sourcePhrase.GetSize();

  // points at the phrase currently being built, i.e. tpv->back()
  TargetPhrase* targetPhrase = NULL;
  while(encodedBitStream.TellFromEnd()) {

    if(state == New) {
      // Append a fresh TargetPhrase to tpv and work on it in place
      tpv->push_back(TargetPhrase(Output));
      targetPhrase = &tpv->back();

      targetPhrase->SetSourcePhrase(sourcePhrase);
      alignment.clear();
      scores.clear();

      // no else: fall straight through to reading the first symbol
      state = Symbol;
    }

    if(state == Symbol) {
      unsigned symbol = m_symbolTree->Read(encodedBitStream);
      if(symbol == phraseStopSymbol) {
        state = Score;
      } else {
        if(m_coding == REnc) {
          // stays empty if the type is none of 1, 2 or 3
          std::string wordString;
          size_t type = GetREncType(symbol);

          if(type == 1) {
            // symbol decodes directly to a target symbol
            unsigned decodedSymbol = DecodeREncSymbol1(symbol);
            wordString = GetTargetSymbol(decodedSymbol);
          } else if (type == 2) {
            // symbol carries a translation rank and an explicit source position
            size_t rank = DecodeREncSymbol2Rank(symbol);
            size_t srcPos = DecodeREncSymbol2Position(symbol);

            // source position outside the source phrase: give up
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              // the word is about to be appended, so its target position is
              // the current phrase length
              size_t trgPos = targetPhrase->GetSize();
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          } else if(type == 3) {
            // symbol carries only a rank; the source position is taken to be
            // the current target length
            size_t rank = DecodeREncSymbol3(symbol);
            size_t srcPos = targetPhrase->GetSize();

            // source position outside the source phrase: give up
            if(srcPos >= sourceWords.size())
              return TargetPhraseVectorPtr();

            wordString = GetTargetSymbol(GetTranslation(sourceWords[srcPos], rank));
            if(m_phraseDictionary.m_useAlignmentInfo) {
              size_t trgPos = srcPos;
              alignment.insert(AlignPoint(srcPos, trgPos));
            }
          }

          Word word;
          word.CreateFromString(Output, *m_output, wordString, false);
          targetPhrase->AddWord(word);
        } else if(m_coding == PREnc) {
          // if the symbol is just a word
          if(GetPREncType(symbol) == 1) {
            unsigned decodedSymbol = DecodePREncSymbol1(symbol);

            Word word;
            word.CreateFromString(Output, *m_output,
                                  GetTargetSymbol(decodedSymbol), false);
            targetPhrase->AddWord(word);
          }
          // if the symbol is a subphrase pointer
          else {
            int left = DecodePREncSymbol2Left(symbol);
            int right = DecodePREncSymbol2Right(symbol);
            unsigned rank = DecodePREncSymbol2Rank(symbol);

            // source span of the referenced subphrase: left is an offset from
            // the current target length, right an offset from the source end
            int srcStart = left + targetPhrase->GetSize();
            int srcEnd = srcSize - right - 1;

            // false positive consistency check
            // (span must be non-empty and lie inside the source phrase)
            if(0 > srcStart || srcStart > srcEnd || unsigned(srcEnd) >= srcSize)
              return TargetPhraseVectorPtr();

            // false positive consistency check
            // (rank may not exceed m_maxRank when a maximum is set)
            if(m_maxRank && rank > m_maxRank)
              return TargetPhraseVectorPtr();

            // set subphrase by default to itself
            TargetPhraseVectorPtr subTpv = tpv;

            // if range smaller than source phrase retrieve subphrase
            if(unsigned(srcEnd - srcStart + 1) != srcSize) {
              Phrase subPhrase = sourcePhrase.GetSubString(WordsRange(srcStart, srcEnd));
              subTpv = CreateTargetPhraseCollection(subPhrase, false);
            }

            // false positive consistency check
            // (the subphrase collection must exist and contain entry `rank`)
            if(subTpv != NULL && rank < subTpv->size()) {
              // insert the subphrase into the main target phrase
              TargetPhrase& subTp = subTpv->at(rank);
              if(m_phraseDictionary.m_useAlignmentInfo) {
                // reconstruct the alignment data based on the alignment of the subphrase
                // by shifting each point by the subphrase's source start and by
                // the current target length (taken before the append below)
                for(AlignmentInfo::const_iterator it = subTp.GetAlignmentInfo().begin();
                    it != subTp.GetAlignmentInfo().end(); it++) {
                  alignment.insert(AlignPointSizeT(srcStart + it->first,
                                                   targetPhrase->GetSize() + it->second));
                }
              }
              targetPhrase->Append(subTp);
            } else
              return TargetPhraseVectorPtr();
          }
        } else {
          // any other coding: the symbol maps directly to a target symbol
          Word word;
          word.CreateFromString(Output, *m_output,
                                GetTargetSymbol(symbol), false);
          targetPhrase->AddWord(word);
        }
      }
    } else if(state == Score) {
      // one score per pass; each component has its own tree when
      // m_multipleScoreTrees is set, otherwise tree 0 serves all of them
      size_t idx = m_multipleScoreTrees ? scores.size() : 0;
      float score = m_scoreTrees[idx]->Read(encodedBitStream);
      scores.push_back(score);

      // all components read: store them and move on
      if(scores.size() == m_numScoreComponent) {
        targetPhrase->SetScore(m_feature, scores, ScoreComponentCollection() /*sparse*/,*m_weight, m_weightWP, *m_languageModels);

        if(m_containsAlignmentInfo)
          state = Alignment;
        else
          state = Add;
      }
    } else if(state == Alignment) {
      // alignment points are always consumed from the stream but only kept
      // when the dictionary uses alignment info
      AlignPoint alignPoint = m_alignTree->Read(encodedBitStream);
      if(alignPoint == alignStopSymbol) {
        state = Add;
      } else {
        if(m_phraseDictionary.m_useAlignmentInfo)
          alignment.insert(AlignPointSizeT(alignPoint));
      }
    }

    // no else: a phrase completed in Score/Alignment above is finalized in
    // the same iteration
    if(state == Add) {
      if(m_phraseDictionary.m_useAlignmentInfo)
        targetPhrase->SetAlignmentInfo(alignment);

      if(m_coding == PREnc) {
        // track the stream position while tpv is within m_maxRank
        // (or always, when no maximum is set)
        if(!m_maxRank || tpv->size() <= m_maxRank)
          bitsLeft = encodedBitStream.TellFromEnd();

        // nested (non-top-level) calls stop once m_maxRank phrases exist
        if(!topLevel && m_maxRank && tpv->size() >= m_maxRank)
          break;
      }

      // 8 or fewer bits left: stop decoding
      // (presumably only padding remains - TODO confirm)
      if(encodedBitStream.TellFromEnd() <= 8)
        break;

      state = New;
    }
  }

  // PREnc only, and only when tpv was empty on entry: cache the result
  // together with the remaining bit count (values of 8 or less stored as 0)
  if(m_coding == PREnc && !extending) {
    bitsLeft = bitsLeft > 8 ? bitsLeft : 0;
    m_decodingCache.Cache(sourcePhrase, tpv, bitsLeft, m_maxRank);
  }

  return tpv;
}
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) { // unknown word, add as trans opt const StaticData &staticData = StaticData::Instance(); const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); size_t isDigit = 0; if (staticData.GetDropUnknown()) { const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface const StringPiece s = f->GetString(); isDigit = s.find_first_of("0123456789"); if (isDigit == string::npos) isDigit = 0; else isDigit = 1; // modify the starting bitmap } Phrase* unksrc = new Phrase(1); unksrc->AddWord() = sourceWord; Word &newWord = unksrc->GetWord(0); newWord.SetIsOOV(true); m_unksrcs.push_back(unksrc); //TranslationOption *transOpt; if (! staticData.GetDropUnknown() || isDigit) { // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; float prob = iterLHS->second; // lhs //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal(); Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); // add to dictionary TargetPhrase *targetPhrase = new TargetPhrase(); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); // scores float unknownScore = FloorScore(TransformScore(prob)); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); 
} // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } // for (iterLHS } else { // drop source word. create blank trans opt float unknownScore = FloorScore(-numeric_limits<float>::infinity()); TargetPhrase *targetPhrase = new TargetPhrase(); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; //float prob = iterLHS->second; Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } } }