void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule) { // print alignment of words for(int ti=startT; ti<=endT; ti++) { WordIndex::const_iterator p = indexT.find(ti); if (p != indexT.end()) { // does word still exist? for(unsigned int i=0; i<m_sentence.alignedToT[ti].size(); i++) { int si = m_sentence.alignedToT[ti][i]; std::string sourceSymbolIndex = IntToString(indexS.find(si)->second); std::string targetSymbolIndex = IntToString(p->second); rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " "; if (! m_options.onlyDirectFlag) rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " "; } } } // print alignment of non terminals HoleList::const_iterator iterHole; for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) { const Hole &hole = *iterHole; std::string sourceSymbolIndex = IntToString(hole.GetPos(0)); std::string targetSymbolIndex = IntToString(hole.GetPos(1)); rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " "; if (!m_options.onlyDirectFlag) rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " "; } rule.alignment.erase(rule.alignment.size()-1); if (!m_options.onlyDirectFlag) { rule.alignmentInv.erase(rule.alignmentInv.size()-1); } }
void ExtractTask::saveTargetSyntacticPreference( const HoleCollection &holeColl, const LabelIndex &labelIndex, ExtractedRule &rule) { rule.targetSyntacticPreference = ""; int holeCount = 0; for (HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin(); iterHoleList != holeColl.GetHoles().end(); ++iterHoleList) { const Hole &hole = *iterHoleList; int labelI = labelIndex[ 2+holeCount ]; string targetLabel = "X"; int startT = hole.GetStart(1); int endT = hole.GetEnd(1); if (m_sentence.targetTree.HasNode(startT,endT)) { rule.targetSyntacticPreference += m_sentence.targetTree.GetNodes(startT,endT)[labelI]->label; rule.targetSyntacticPreference += " "; } else { rule.targetSyntacticPreference += "X "; } ++holeCount; } rule.targetSyntacticPreference.erase(rule.targetSyntacticPreference.size()-1); }
/**
 * Builds the target side of a hierarchical rule as a space-separated
 * string of words and bracketed non-terminals.
 *
 * Walks target positions startT..endT. A position that starts the next
 * hole of holeColl.GetHoles() becomes a non-terminal and the walk jumps to
 * the end of that hole; any other position is copied from
 * m_sentence.target and its output position is recorded in indexT.
 * For each hole the chosen target label and output position are stored
 * back into the hole (side 1).
 *
 * @param startT,endT   inclusive target span of the rule
 * @param startS,endS   source span; only consulted for the boundary-rule label
 * @param indexT        receives sentence position -> output position for words
 * @param holeColl      holes of the rule; must not be empty
 * @param labelIndex    label selection; entry 2 + hole number picks the node
 * @param logPCFGScore  reduced by the score of every hole node when
 *                      m_options.pcfgScore is set
 * @param countS        compared against endS + 1 for the boundary-rule label
 * @return the target phrase without trailing space
 */
string ExtractTask::saveTargetHieroPhrase( int startT, int endT, int startS, int endS
    , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
    , int countS)
{
  HoleList::iterator nextHole = holeColl.GetHoles().begin();
  assert(nextHole != holeColl.GetHoles().end());

  string phrase;
  int symbolPos = 0;   // position of the next symbol in the output
  int holesDone = 0;   // number of holes already emitted

  int wordPos = startT;
  while (wordPos <= endT) {
    const bool startsHole = nextHole != holeColl.GetHoles().end()
                            && nextHole->GetStart(1) == wordPos;

    if (!startsHole) {
      // plain word: remember where it ends up in the rule
      indexT[wordPos] = symbolPos;
      phrase += m_sentence.target[wordPos] + " ";
    } else {
      Hole &hole = *nextHole;
      const string &sourceLabel = hole.GetLabel(0);
      assert(sourceLabel != "");

      const int labelI = labelIndex[ 2+holesDone ];

      // pick the target label: syntax node, boundary symbol, or generic X
      string targetLabel = "X";
      if (m_options.targetSyntax && !m_options.targetSyntacticPreferences) {
        targetLabel = m_sentence.targetTree.GetNodes(wordPos,hole.GetEnd(1))[labelI]->label;
      } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
        targetLabel = "S";
      }
      hole.SetLabel(targetLabel, 1);

      // "[target] " in unpaired format, "[source][target] " otherwise
      phrase += "[";
      if (!m_options.unpairedExtractFormat) {
        phrase += sourceLabel;
        phrase += "][";
      }
      phrase += targetLabel;
      phrase += "] ";

      if (m_options.pcfgScore) {
        logPCFGScore -= getPcfgScore(*m_sentence.targetTree.GetNodes(wordPos,hole.GetEnd(1))[labelI]);
      }

      // continue after the hole; the shared increment below moves past its end
      wordPos = hole.GetEnd(1);
      hole.SetPos(symbolPos, 1);
      ++nextHole;
      ++holesDone;
    }

    ++symbolPos;
    ++wordPos;
  }

  assert(nextHole == holeColl.GetHoles().end());
  return phrase.erase(phrase.size()-1);
}
/**
 * Builds the target side of a hierarchical rule as a space-separated
 * string of words and bracketed non-terminals.
 *
 * Walks target positions startT..endT of the sentence. A position that
 * starts the next hole becomes a non-terminal and the walk jumps to the
 * hole's end; any other position is copied from sentence.target and its
 * output position is recorded in indexT. The chosen label and output
 * position are stored into each hole (side 1).
 *
 * @param sentence      sentence pair providing words and the target tree
 * @param startT,endT   inclusive target span of the rule
 * @param startS,endS   not used by this function
 * @param indexT        receives sentence position -> output position for words
 * @param holeColl      holes of the rule; must not be empty
 * @param labelIndex    label selection; entry 2 + hole number picks the node
 * @param logPCFGScore  reduced by each hole node's score when
 *                      options.pcfgScore is set
 * @return the target phrase without trailing space
 */
string printTargetHieroPhrase(SentenceAlignmentWithSyntax &sentence
                              , int startT, int endT, int startS, int endS
                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
{
  HoleList::iterator nextHole = holeColl.GetHoles().begin();
  assert(nextHole != holeColl.GetHoles().end());

  // target syntax only: non-terminals carry just the target label
  const bool stringToTree = !options.sourceSyntax && options.targetSyntax;

  string phrase;
  int symbolPos = 0;   // position of the next symbol in the output
  int holesDone = 0;   // number of holes already emitted

  int wordPos = startT;
  while (wordPos <= endT) {
    const bool startsHole = nextHole != holeColl.GetHoles().end()
                            && nextHole->GetStart(1) == wordPos;

    if (!startsHole) {
      indexT[wordPos] = symbolPos;
      phrase += sentence.target[wordPos] + " ";
    } else {
      Hole &hole = *nextHole;
      const string &sourceLabel = hole.GetLabel(0);
      assert(sourceLabel != "");

      const int labelI = labelIndex[ 2+holesDone ];

      string targetLabel = "X";
      if (options.targetSyntax) {
        targetLabel = sentence.targetTree.GetNodes(wordPos,hole.GetEnd(1))[ labelI ]->GetLabel();
      }
      hole.SetLabel(targetLabel, 1);

      phrase += "[";
      if (!stringToTree) {
        phrase += sourceLabel;
        phrase += "][";
      }
      phrase += targetLabel;
      phrase += "] ";

      if (options.pcfgScore) {
        logPCFGScore -= sentence.targetTree.GetNodes(wordPos,hole.GetEnd(1))[labelI]->GetScore();
      }

      // continue after the hole; the shared increment below moves past its end
      wordPos = hole.GetEnd(1);
      hole.SetPos(symbolPos, 1);
      ++nextHole;
      ++holesDone;
    }

    ++symbolPos;
    ++wordPos;
  }

  assert(nextHole == holeColl.GetHoles().end());
  return phrase.erase(phrase.size()-1);
}
void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS) { LabelIndex labelIndex,labelCount; // number of target head labels int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1; if (m_options.targetSyntacticPreferences && !numLabels) { numLabels++; } labelCount.push_back(numLabels); labelIndex.push_back(0); // number of source head labels numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1; labelCount.push_back(numLabels); labelIndex.push_back(0); // number of target hole labels for( HoleList::const_iterator hole = holeColl.GetHoles().begin(); hole != holeColl.GetHoles().end(); hole++ ) { int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ; if (m_options.targetSyntacticPreferences && !numLabels) { numLabels++; } labelCount.push_back(numLabels); labelIndex.push_back(0); } // number of source hole labels holeColl.SortSourceHoles(); for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin(); i != holeColl.GetSortedSourceHoles().end(); i++ ) { const Hole &hole = **i; int numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ; labelCount.push_back(numLabels); labelIndex.push_back(0); } // loop through the holes bool done = false; while(!done) { saveHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS ); for(unsigned int i=0; i<labelIndex.size(); i++) { labelIndex[i]++; if(labelIndex[i] == labelCount[i]) { labelIndex[i] = 0; if (i == labelIndex.size()-1) done = true; } else { break; } } } }
void printAllHieroPhrases( SentenceAlignmentWithSyntax &sentence , int startT, int endT, int startS, int endS , HoleCollection &holeColl) { LabelIndex labelIndex,labelCount; // number of target head labels int numLabels = options.targetSyntax ? sentence.targetTree.GetNodes(startT,endT).size() : 1; labelCount.push_back(numLabels); labelIndex.push_back(0); // number of source head labels numLabels = options.sourceSyntax ? sentence.sourceTree.GetNodes(startS,endS).size() : 1; labelCount.push_back(numLabels); labelIndex.push_back(0); // number of target hole labels for( HoleList::const_iterator hole = holeColl.GetHoles().begin(); hole != holeColl.GetHoles().end(); hole++ ) { int numLabels = options.targetSyntax ? sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ; labelCount.push_back(numLabels); labelIndex.push_back(0); } // number of source hole labels holeColl.SortSourceHoles(); for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin(); i != holeColl.GetSortedSourceHoles().end(); i++ ) { const Hole &hole = **i; int numLabels = options.sourceSyntax ? sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ; labelCount.push_back(numLabels); labelIndex.push_back(0); } // loop through the holes bool done = false; while(!done) { printHieroPhrase( sentence, startT, endT, startS, endS, holeColl, labelIndex ); for(int i=0; i<labelIndex.size(); i++) { labelIndex[i]++; if(labelIndex[i] == labelCount[i]) { labelIndex[i] = 0; if (i == labelIndex.size()-1) done = true; } else { break; } } } }
/**
 * Builds the target side of a hierarchical rule as a space-separated
 * string of words and "[source][target]" non-terminals.
 *
 * Walks target positions startT..endT. A position that starts the next
 * hole becomes a non-terminal and the walk jumps to the hole's end; any
 * other position is copied from m_sentence->target and its output position
 * is recorded in indexT. The chosen label and output position are stored
 * into each hole (side 1).
 *
 * @param startT,endT  inclusive target span of the rule
 * @param startS,endS  not used by this function
 * @param indexT       receives sentence position -> output position for words
 * @param holeColl     holes of the rule; must not be empty
 * @param labelIndex   label selection; entry 2 + hole number picks the node
 * @return the target phrase without trailing space
 */
string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
    , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex)
{
  HoleList::iterator nextHole = holeColl.GetHoles().begin();
  assert(nextHole != holeColl.GetHoles().end());

  string phrase;
  int symbolPos = 0;   // position of the next symbol in the output
  int holesDone = 0;   // number of holes already emitted

  int wordPos = startT;
  while (wordPos <= endT) {
    const bool startsHole = nextHole != holeColl.GetHoles().end()
                            && nextHole->GetStart(1) == wordPos;

    if (!startsHole) {
      indexT[wordPos] = symbolPos;
      phrase += m_sentence->target[wordPos] + " ";
    } else {
      Hole &hole = *nextHole;
      const string &sourceLabel = hole.GetLabel(0);
      assert(sourceLabel != "");

      const int labelI = labelIndex[ 2+holesDone ];

      string targetLabel = "X";
      if (m_options.targetSyntax) {
        targetLabel = m_sentence->targetTree.GetNodes(wordPos,hole.GetEnd(1))[ labelI ]->GetLabel();
      }
      hole.SetLabel(targetLabel, 1);

      phrase += "[" + sourceLabel + "][" + targetLabel + "] ";

      // continue after the hole; the shared increment below moves past its end
      wordPos = hole.GetEnd(1);
      hole.SetPos(symbolPos, 1);
      ++nextHole;
      ++holesDone;
    }

    ++symbolPos;
    ++wordPos;
  }

  assert(nextHole == holeColl.GetHoles().end());
  return phrase.erase(phrase.size()-1);
}
void preprocessSourceHieroPhrase( SentenceAlignmentWithSyntax &sentence , int startT, int endT, int startS, int endS , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex) { vector<Hole*>::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin(); assert(iterHoleList != holeColl.GetSortedSourceHoles().end()); int outPos = 0; int holeCount = 0; int holeTotal = holeColl.GetHoles().size(); for(int currPos = startS; currPos <= endS; currPos++) { bool isHole = false; if (iterHoleList != holeColl.GetSortedSourceHoles().end()) { const Hole &hole = **iterHoleList; isHole = hole.GetStart(0) == currPos; } if (isHole) { Hole &hole = **iterHoleList; int labelI = labelIndex[ 2+holeCount+holeTotal ]; string label = options.sourceSyntax ? sentence.sourceTree.GetNodes(currPos,hole.GetEnd(0))[ labelI ]->GetLabel() : "X"; hole.SetLabel(label, 0); currPos = hole.GetEnd(0); hole.SetPos(outPos, 0); ++iterHoleList; ++holeCount; } else { indexS[currPos] = outPos; } outPos++; } assert(iterHoleList == holeColl.GetSortedSourceHoles().end()); }
// this function is called recursively // it pokes a new hole into the phrase pair, and then calls itself for more holes void addHieroRule( SentenceAlignmentWithSyntax &sentence , int startT, int endT, int startS, int endS , RuleExist &ruleExist, const HoleCollection &holeColl , int numHoles, int initStartT, int wordCountT, int wordCountS) { // done, if already the maximum number of non-terminals in phrase pair if (numHoles >= options.maxNonTerm) return; // find a hole... for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT) { for (int endHoleT = startHoleT+(options.minHoleTarget-1); endHoleT <= endT; ++endHoleT) { // if last non-terminal, enforce word count limit if (numHoles == options.maxNonTerm-1 && wordCountT - (endHoleT-startT+1) + (numHoles+1) > options.maxSymbolsTarget) continue; // determine the number of remaining target words const int newWordCountT = wordCountT - (endHoleT-startHoleT+1); // always enforce min word count limit if (newWordCountT < options.minWords) continue; // except the whole span if (startHoleT == startT && endHoleT == endT) continue; // does a phrase cover this target span? 
// if it does, then there should be a list of mapped source phrases // (multiple possible due to unaligned words) const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT); // loop over sub phrase pairs HoleList::const_iterator iterSourceHoles; for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles) { const Hole &sourceHole = *iterSourceHoles; const int sourceHoleSize = sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1; // enforce minimum hole size if (sourceHoleSize < options.minHoleSource) continue; // determine the number of remaining source words const int newWordCountS = wordCountS - sourceHoleSize; // if last non-terminal, enforce word count limit if (numHoles == options.maxNonTerm-1 && newWordCountS + (numHoles+1) > options.maxSymbolsSource) continue; // enforce min word count limit if (newWordCountS < options.minWords) continue; // hole must be subphrase of the source phrase // (may be violated if subphrase contains additional unaligned source word) if (startS > sourceHole.GetStart(0) || endS < sourceHole.GetEnd(0)) continue; // make sure target side does not overlap with another hole if (holeColl.OverlapSource(sourceHole)) continue; // if consecutive non-terminals are not allowed, also check for source if (!options.nonTermConsecSource && holeColl.ConsecSource(sourceHole) ) continue; // check that rule scope would not exceed limit if sourceHole // were added if (holeColl.Scope(sourceHole) > options.maxScope) continue; // require that at least one aligned word is left (unless there are no words at all) if (options.requireAlignedWord && (newWordCountS > 0 || newWordCountT > 0)) { HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin(); bool foundAlignedWord = false; // loop through all word positions for(int pos = startT; pos <= endT && !foundAlignedWord; pos++) { // new hole? moving on... if (pos == startHoleT) { pos = endHoleT; } // covered by hole? moving on... 
else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos) { pos = iterHoleList->GetEnd(1); ++iterHoleList; } // covered by word? check if it is aligned else { if (sentence.alignedToT[pos].size() > 0) foundAlignedWord = true; } } if (!foundAlignedWord) continue; } // update list of holes in this phrase pair HoleCollection copyHoleColl(holeColl); copyHoleColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0)); // now some checks that disallow this phrase pair, but not further recursion bool allowablePhrase = true; // maximum words count violation? if (newWordCountS + (numHoles+1) > options.maxSymbolsSource) allowablePhrase = false; if (newWordCountT + (numHoles+1) > options.maxSymbolsTarget) allowablePhrase = false; // passed all checks... if (allowablePhrase) printAllHieroPhrases(sentence, startT, endT, startS, endS, copyHoleColl); // recursively search for next hole int nextInitStartT = options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2; addHieroRule(sentence, startT, endT, startS, endS , ruleExist, copyHoleColl, numHoles + 1, nextInitStartT , newWordCountT, newWordCountS); } } } }
void ExtractTask::saveHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, LabelIndex &labelIndex, int countS) { WordIndex indexS, indexT; // to keep track of word positions in rule ExtractedRule rule( startT, endT, startS, endS ); // phrase labels string targetLabel; if (m_options.targetSyntax) { targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel(); } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) { targetLabel = "S"; } else { targetLabel = "X"; } string sourceLabel = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X"; // create non-terms on the source side preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); // target if (m_options.pcfgScore) { double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS) + " [" + targetLabel + "]"; rule.pcfgScore = std::exp(logPCFGScore); } else { double logPCFGScore = 0.0f; rule.target = saveTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS) + " [" + targetLabel + "]"; } // source rule.source = saveSourceHieroPhrase(startT, endT, startS, endS, holeColl, labelIndex); if (m_options.conditionOnTargetLhs) { rule.source += " [" + targetLabel + "]"; } else { rule.source += " [" + sourceLabel + "]"; } // alignment saveHieroAlignment(startT, endT, startS, endS, indexS, indexT, holeColl, rule); // context (words to left and right) if (m_options.flexScoreFlag) { rule.sourceContextLeft = startS == 0 ? "<s>" : m_sentence.source[startS-1]; rule.sourceContextRight = endS+1 == m_sentence.source.size() ? "<s>" : m_sentence.source[endS+1]; rule.targetContextLeft = startT == 0 ? "<s>" : m_sentence.target[startT-1]; rule.targetContextRight = endT+1 == m_sentence.target.size() ? 
"<s>" : m_sentence.target[endT+1]; rule.sourceHoleString = ""; rule.targetHoleString = ""; HoleList::const_iterator iterHole; for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) { const Hole &hole = *iterHole; rule.sourceHoleString += hole.GetLabel(0) + ": "; // rule starts with nonterminal; end of NT is considered left context if (hole.GetStart(0) == startS) { rule.sourceContextLeft = m_sentence.source[hole.GetEnd(0)]; } // rule ends with nonterminal; start of NT is considered right context else if (hole.GetEnd(0) == endS) { rule.sourceContextRight = m_sentence.source[hole.GetStart(0)]; } if (hole.GetStart(1) == startT) { rule.targetContextLeft = m_sentence.target[hole.GetEnd(1)]; } else if (hole.GetEnd(1) == endT) { rule.targetContextRight = m_sentence.target[hole.GetStart(1)]; } for (int i = hole.GetStart(0); i <= hole.GetEnd(0); ++i) { rule.sourceHoleString += m_sentence.source[i] + " "; } rule.targetHoleString += hole.GetLabel(1) + ": "; for (int i = hole.GetStart(1); i <= hole.GetEnd(1); ++i) { rule.targetHoleString += m_sentence.target[i] + " "; } } } addRuleToCollection( rule ); }