/** TODO: this method isn't used anywhere. Remove? */ void ChartHypothesis::GetOutputPhrase(size_t leftRightMost, size_t numWords, Phrase &outPhrase) const { const TargetPhrase &tp = GetCurrTargetPhrase(); size_t targetSize = tp.GetSize(); for (size_t i = 0; i < targetSize; ++i) { size_t pos; if (leftRightMost == 1) { pos = i; } else if (leftRightMost == 2) { pos = targetSize - i - 1; } else { abort(); } const Word &word = tp.GetWord(pos); if (word.IsNonTerminal()) { // non-term. fill out with prev hypo size_t nonTermInd = tp.GetAlignNonTerm().GetNonTermIndexMap()[pos]; const ChartHypothesis *prevHypo = m_prevHypos[nonTermInd]; prevHypo->GetOutputPhrase(outPhrase); } else { outPhrase.AddWord(word); } if (outPhrase.GetSize() >= numWords) { return; } } }
/** Split one token into words and append them to phrase.
 * A bracketed token (starts with '[' and ends with ']') is a non-terminal:
 * either a lone LHS like "[X]", or a source/target pair like "[X][Y]",
 * in which case the two halves are added according to the addSourceNonTerm /
 * addTargetNonTerm flags. Anything else is a plain terminal word.
 */
void OnDiskQuery::Tokenize(Phrase &phrase, const std::string &token, bool addSourceNonTerm, bool addTargetNonTerm)
{
  const size_t len = token.size();

  // Bracketed at both ends => non-terminal token.
  bool isNonTerm = false;
  if (token.compare(0, 1, "[") == 0) {
    isNonTerm = (token.compare(len - 1, 1, "]") == 0);
  }

  if (!isNonTerm) {
    // Plain terminal word.
    WordPtr word(new Word());
    word->CreateFromString(token, m_wrapper.GetVocab());
    phrase.AddWord(word);
    return;
  }

  // A second '[' (searched from index 2) separates source and target halves.
  const size_t splitPos = token.find_first_of("[", 2);
  if (splitPos == std::string::npos) {
    // lhs - only 1 word, e.g. "[X]"
    WordPtr word(new Word());
    word->CreateFromString(token.substr(0, splitPos), m_wrapper.GetVocab());
    phrase.AddWord(word);
  } else {
    // source & target non-terms, e.g. "[X][Y]"
    if (addSourceNonTerm) {
      WordPtr word(new Word());
      word->CreateFromString(token.substr(0, splitPos), m_wrapper.GetVocab());
      phrase.AddWord(word);
    }
    if (addTargetNonTerm) {
      WordPtr word(new Word());
      word->CreateFromString(token.substr(splitPos, len - splitPos), m_wrapper.GetVocab());
      phrase.AddWord(word);
    }
  }
}
/** Constructor: initialize the base class, then build the queue of input
 * paths (m_inputPathQueue) for a word lattice.
 *
 * Phase 1 seeds the queue with all 1-word paths (one per arc in each
 * column). Phase 2 iteratively extends every existing path whose next-node
 * offset lands on the current end position, up to the maximum phrase length.
 *
 * \param input the word lattice to collect paths from
 * \param maxNoTransOptPerCoverage max translation options per coverage span
 * \param translationOptionThreshold pruning threshold for translation options
 *
 * Throws if the legacy binary phrase table is in use (not supported here)
 * or if no input feature is specified.
 */
TranslationOptionCollectionLattice::TranslationOptionCollectionLattice(
  const WordLattice &input
  , size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
  : TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
  // Fixed typo in user-visible message: "legqacy" -> "legacy".
  UTIL_THROW_IF2(StaticData::Instance().GetUseLegacyPT(),
                 "Not for models using the legacy binary phrase table");

  const InputFeature *inputFeature = StaticData::Instance().GetInputFeature();
  UTIL_THROW_IF2(inputFeature == NULL, "Input feature must be specified");

  size_t maxPhraseLength = StaticData::Instance().GetMaxPhraseLength();
  size_t size = input.GetSize();

  // 1-word phrases: one InputPath per arc in each lattice column.
  for (size_t startPos = 0; startPos < size; ++startPos) {
    const std::vector<size_t> &nextNodes = input.GetNextNodes(startPos);

    WordsRange range(startPos, startPos);
    const NonTerminalSet &labels = input.GetLabelSet(startPos, startPos);

    const ConfusionNet::Column &col = input.GetColumn(startPos);
    for (size_t i = 0; i < col.size(); ++i) {
      const Word &word = col[i].first;
      UTIL_THROW_IF2(word.IsEpsilon(), "Epsilon not supported");

      Phrase subphrase;
      subphrase.AddWord(word);

      const ScorePair &scores = col[i].second;
      ScorePair *inputScore = new ScorePair(scores);

      InputPath *path = new InputPath(subphrase, labels, range, NULL, inputScore);

      size_t nextNode = nextNodes[i];
      path->SetNextNode(nextNode);

      m_inputPathQueue.push_back(path);
    }
  }

  // Iteratively extend all paths. New paths are appended to the queue, so
  // snapshot the current size before each pass: only paths that existed at
  // the start of the pass are extension candidates.
  for (size_t endPos = 1; endPos < size; ++endPos) {
    const std::vector<size_t> &nextNodes = input.GetNextNodes(endPos);

    // loop thru every previous path
    size_t numPrevPaths = m_inputPathQueue.size();

    for (size_t pathInd = 0; pathInd < numPrevPaths; ++pathInd) {
      const InputPath &prevPath = *m_inputPathQueue[pathInd];

      // Only extend a path whose outgoing arc lands exactly on endPos.
      size_t nextNode = prevPath.GetNextNode();
      if (prevPath.GetWordsRange().GetEndPos() + nextNode != endPos) {
        continue;
      }

      size_t startPos = prevPath.GetWordsRange().GetStartPos();
      if (endPos - startPos + 1 > maxPhraseLength) {
        continue;
      }

      WordsRange range(startPos, endPos);
      const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);

      const Phrase &prevPhrase = prevPath.GetPhrase();
      const ScorePair *prevInputScore = prevPath.GetInputScore();
      UTIL_THROW_IF2(prevInputScore == NULL, "Null previous score");

      // loop thru every word at this position
      const ConfusionNet::Column &col = input.GetColumn(endPos);
      for (size_t colInd = 0; colInd < col.size(); ++colInd) {
        const Word &word = col[colInd].first;
        Phrase subphrase(prevPhrase);
        subphrase.AddWord(word);

        // Accumulate the arc score onto a copy of the previous path's score.
        const ScorePair &scores = col[colInd].second;
        ScorePair *inputScore = new ScorePair(*prevInputScore);
        inputScore->PlusEquals(scores);

        InputPath *path = new InputPath(subphrase, labels, range, &prevPath, inputScore);

        size_t nextNode = nextNodes[colInd];
        path->SetNextNode(nextNode);

        m_inputPathQueue.push_back(path);
      } // for each word in the column
    } // for each previous path
  }
}
/** Constructor: initialize the base class, then build the matrix of input
 * paths (m_inputPathMatrix) for a confusion network.
 *
 * Phase 1 creates all 1-word paths (one per alternative in each column).
 * Phase 2 builds subphrases of length 2..maxSizePhrase by extending every
 * path ending at endPos-1 with every alternative word in column endPos.
 * Every created path is also appended to m_inputPathQueue.
 *
 * \param input the confusion network to collect paths from
 * \param maxNoTransOptPerCoverage max translation options per coverage span
 * \param translationOptionThreshold pruning threshold for translation options
 */
TranslationOptionCollectionConfusionNet::TranslationOptionCollectionConfusionNet(
  const ConfusionNet &input
  , size_t maxNoTransOptPerCoverage, float translationOptionThreshold)
  : TranslationOptionCollection(input, maxNoTransOptPerCoverage, translationOptionThreshold)
{
  const InputFeature *inputFeature = StaticData::Instance().GetInputFeature();
  CHECK(inputFeature);

  size_t inputSize = input.GetSize();
  m_inputPathMatrix.resize(inputSize);

  // Phrases can never be longer than the input itself.
  size_t maxSizePhrase = StaticData::Instance().GetMaxPhraseLength();
  maxSizePhrase = std::min(inputSize, maxSizePhrase);

  // 1-word phrases: one InputPath per alternative in each column.
  for (size_t startPos = 0; startPos < inputSize; ++startPos) {
    vector<InputPathList> &vec = m_inputPathMatrix[startPos];
    vec.push_back(InputPathList());
    InputPathList &list = vec.back();

    WordsRange range(startPos, startPos);
    const NonTerminalSet &labels = input.GetLabelSet(startPos, startPos);

    const ConfusionNet::Column &col = input.GetColumn(startPos);
    for (size_t i = 0; i < col.size(); ++i) {
      const Word &word = col[i].first;
      Phrase subphrase;
      subphrase.AddWord(word);

      const ScorePair &scores = col[i].second;
      ScorePair *inputScore = new ScorePair(scores);

      InputPath *path = new InputPath(subphrase, labels, range, NULL, inputScore);

      list.push_back(path);
      m_inputPathQueue.push_back(path);
    }
  }

  // Subphrases of 2+ words, built by extending the paths one position shorter.
  for (size_t phraseSize = 2; phraseSize <= maxSizePhrase; ++phraseSize) {
    for (size_t startPos = 0; startPos < inputSize - phraseSize + 1; ++startPos) {
      size_t endPos = startPos + phraseSize - 1;

      WordsRange range(startPos, endPos);
      const NonTerminalSet &labels = input.GetLabelSet(startPos, endPos);

      vector<InputPathList> &vec = m_inputPathMatrix[startPos];
      vec.push_back(InputPathList());
      InputPathList &list = vec.back();

      // loop thru every previous path
      const InputPathList &prevPaths = GetInputPathList(startPos, endPos - 1);

      // NOTE: removed dead local `prevNodesInd` from the original — it was
      // incremented each iteration but never read.
      InputPathList::const_iterator iterPath;
      for (iterPath = prevPaths.begin(); iterPath != prevPaths.end(); ++iterPath) {
        const InputPath &prevPath = **iterPath;

        const Phrase &prevPhrase = prevPath.GetPhrase();
        const ScorePair *prevInputScore = prevPath.GetInputScore();
        CHECK(prevInputScore);

        // loop thru every word at this position
        const ConfusionNet::Column &col = input.GetColumn(endPos);
        for (size_t i = 0; i < col.size(); ++i) {
          const Word &word = col[i].first;
          Phrase subphrase(prevPhrase);
          subphrase.AddWord(word);

          // Accumulate this word's score onto a copy of the previous score.
          const ScorePair &scores = col[i].second;
          ScorePair *inputScore = new ScorePair(*prevInputScore);
          inputScore->PlusEquals(scores);

          InputPath *path = new InputPath(subphrase, labels, range, &prevPath, inputScore);

          list.push_back(path);
          m_inputPathQueue.push_back(path);
        } // for each word in the column
      } // for each previous path
    }
  }
}
void ChartParserUnknown::Process(const Word &sourceWord, const WordsRange &range, ChartParserCallback &to) { // unknown word, add as trans opt const StaticData &staticData = StaticData::Instance(); const UnknownWordPenaltyProducer &unknownWordPenaltyProducer = UnknownWordPenaltyProducer::Instance(); size_t isDigit = 0; if (staticData.GetDropUnknown()) { const Factor *f = sourceWord[0]; // TODO hack. shouldn't know which factor is surface const StringPiece s = f->GetString(); isDigit = s.find_first_of("0123456789"); if (isDigit == string::npos) isDigit = 0; else isDigit = 1; // modify the starting bitmap } Phrase* unksrc = new Phrase(1); unksrc->AddWord() = sourceWord; Word &newWord = unksrc->GetWord(0); newWord.SetIsOOV(true); m_unksrcs.push_back(unksrc); //TranslationOption *transOpt; if (! staticData.GetDropUnknown() || isDigit) { // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; float prob = iterLHS->second; // lhs //const Word &sourceLHS = staticData.GetInputDefaultNonTerminal(); Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); // add to dictionary TargetPhrase *targetPhrase = new TargetPhrase(); Word &targetWord = targetPhrase->AddWord(); targetWord.CreateUnknownWord(sourceWord); // scores float unknownScore = FloorScore(TransformScore(prob)); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); targetPhrase->SetAlignmentInfo("0-0"); if (staticData.IsDetailedTreeFragmentsTranslationReportingEnabled()) { targetPhrase->SetProperty("Tree","[ " + (*targetLHS)[0]->GetString().as_string() + " "+sourceWord[0]->GetString().as_string()+" ]"); 
} // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } // for (iterLHS } else { // drop source word. create blank trans opt float unknownScore = FloorScore(-numeric_limits<float>::infinity()); TargetPhrase *targetPhrase = new TargetPhrase(); // loop const UnknownLHSList &lhsList = staticData.GetUnknownLHS(); UnknownLHSList::const_iterator iterLHS; for (iterLHS = lhsList.begin(); iterLHS != lhsList.end(); ++iterLHS) { const string &targetLHSStr = iterLHS->first; //float prob = iterLHS->second; Word *targetLHS = new Word(true); targetLHS->CreateFromString(Output, staticData.GetOutputFactorOrder(), targetLHSStr, true); UTIL_THROW_IF2(targetLHS->GetFactor(0) == NULL, "Null factor for target LHS"); targetPhrase->GetScoreBreakdown().Assign(&unknownWordPenaltyProducer, unknownScore); targetPhrase->Evaluate(*unksrc); targetPhrase->SetTargetLHS(targetLHS); // chart rule to.AddPhraseOOV(*targetPhrase, m_cacheTargetPhraseCollection, range); } } }