void printAllHieroPhrases( SentenceAlignmentWithSyntax &sentence , int startT, int endT, int startS, int endS , HoleCollection &holeColl) { LabelIndex labelIndex,labelCount; // number of target head labels int numLabels = options.targetSyntax ? sentence.targetTree.GetNodes(startT,endT).size() : 1; labelCount.push_back(numLabels); labelIndex.push_back(0); // number of source head labels numLabels = options.sourceSyntax ? sentence.sourceTree.GetNodes(startS,endS).size() : 1; labelCount.push_back(numLabels); labelIndex.push_back(0); // number of target hole labels for( HoleList::const_iterator hole = holeColl.GetHoles().begin(); hole != holeColl.GetHoles().end(); hole++ ) { int numLabels = options.targetSyntax ? sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ; labelCount.push_back(numLabels); labelIndex.push_back(0); } // number of source hole labels holeColl.SortSourceHoles(); for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin(); i != holeColl.GetSortedSourceHoles().end(); i++ ) { const Hole &hole = **i; int numLabels = options.sourceSyntax ? sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ; labelCount.push_back(numLabels); labelIndex.push_back(0); } // loop through the holes bool done = false; while(!done) { printHieroPhrase( sentence, startT, endT, startS, endS, holeColl, labelIndex ); for(int i=0; i<labelIndex.size(); i++) { labelIndex[i]++; if(labelIndex[i] == labelCount[i]) { labelIndex[i] = 0; if (i == labelIndex.size()-1) done = true; } else { break; } } } }
// this function is called recursively // it pokes a new hole into the phrase pair, and then calls itself for more holes void addHieroRule( SentenceAlignmentWithSyntax &sentence , int startT, int endT, int startS, int endS , RuleExist &ruleExist, const HoleCollection &holeColl , int numHoles, int initStartT, int wordCountT, int wordCountS) { // done, if already the maximum number of non-terminals in phrase pair if (numHoles >= options.maxNonTerm) return; // find a hole... for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT) { for (int endHoleT = startHoleT+(options.minHoleTarget-1); endHoleT <= endT; ++endHoleT) { // if last non-terminal, enforce word count limit if (numHoles == options.maxNonTerm-1 && wordCountT - (endHoleT-startT+1) + (numHoles+1) > options.maxSymbolsTarget) continue; // determine the number of remaining target words const int newWordCountT = wordCountT - (endHoleT-startHoleT+1); // always enforce min word count limit if (newWordCountT < options.minWords) continue; // except the whole span if (startHoleT == startT && endHoleT == endT) continue; // does a phrase cover this target span? // if it does, then there should be a list of mapped source phrases // (multiple possible due to unaligned words) const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT); // loop over sub phrase pairs HoleList::const_iterator iterSourceHoles; for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles) { const Hole &sourceHole = *iterSourceHoles; const int sourceHoleSize = sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1; // enforce minimum hole size if (sourceHoleSize < options.minHoleSource) continue; // determine the number of remaining source words const int newWordCountS = wordCountS - sourceHoleSize; // if last non-terminal, enforce word count limit if (numHoles == options.maxNonTerm-1 && newWordCountS + (numHoles+1) > options.maxSymbolsSource) continue; // enforce min word count limit if (newWordCountS < options.minWords) continue; // hole must be subphrase of the source phrase // (may be violated if subphrase contains additional unaligned source word) if (startS > sourceHole.GetStart(0) || endS < sourceHole.GetEnd(0)) continue; // make sure target side does not overlap with another hole if (holeColl.OverlapSource(sourceHole)) continue; // if consecutive non-terminals are not allowed, also check for source if (!options.nonTermConsecSource && holeColl.ConsecSource(sourceHole) ) continue; // check that rule scope would not exceed limit if sourceHole // were added if (holeColl.Scope(sourceHole) > options.maxScope) continue; // require that at least one aligned word is left (unless there are no words at all) if (options.requireAlignedWord && (newWordCountS > 0 || newWordCountT > 0)) { HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin(); bool foundAlignedWord = false; // loop through all word positions for(int pos = startT; pos <= endT && !foundAlignedWord; pos++) { // new hole? moving on... if (pos == startHoleT) { pos = endHoleT; } // covered by hole? moving on... else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos) { pos = iterHoleList->GetEnd(1); ++iterHoleList; } // covered by word? check if it is aligned else { if (sentence.alignedToT[pos].size() > 0) foundAlignedWord = true; } } if (!foundAlignedWord) continue; } // update list of holes in this phrase pair HoleCollection copyHoleColl(holeColl); copyHoleColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0)); // now some checks that disallow this phrase pair, but not further recursion bool allowablePhrase = true; // maximum words count violation? if (newWordCountS + (numHoles+1) > options.maxSymbolsSource) allowablePhrase = false; if (newWordCountT + (numHoles+1) > options.maxSymbolsTarget) allowablePhrase = false; // passed all checks... if (allowablePhrase) printAllHieroPhrases(sentence, startT, endT, startS, endS, copyHoleColl); // recursively search for next hole int nextInitStartT = options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2; addHieroRule(sentence, startT, endT, startS, endS , ruleExist, copyHoleColl, numHoles + 1, nextInitStartT , newWordCountT, newWordCountS); } } } }
void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS) { LabelIndex labelIndex,labelCount; // number of target head labels int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1; if (m_options.targetSyntacticPreferences && !numLabels) { numLabels++; } labelCount.push_back(numLabels); labelIndex.push_back(0); // number of source head labels numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1; labelCount.push_back(numLabels); labelIndex.push_back(0); // number of target hole labels for( HoleList::const_iterator hole = holeColl.GetHoles().begin(); hole != holeColl.GetHoles().end(); hole++ ) { int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ; if (m_options.targetSyntacticPreferences && !numLabels) { numLabels++; } labelCount.push_back(numLabels); labelIndex.push_back(0); } // number of source hole labels holeColl.SortSourceHoles(); for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin(); i != holeColl.GetSortedSourceHoles().end(); i++ ) { const Hole &hole = **i; int numLabels = m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ; labelCount.push_back(numLabels); labelIndex.push_back(0); } // loop through the holes bool done = false; while(!done) { saveHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS ); for(unsigned int i=0; i<labelIndex.size(); i++) { labelIndex[i]++; if(labelIndex[i] == labelCount[i]) { labelIndex[i] = 0; if (i == labelIndex.size()-1) done = true; } else { break; } } } }