Пример #1
0
// Saves every labelled variant of one hierarchical phrase pair.
//
// One "label slot" is created for each place where a syntax label can be
// chosen. The slots are laid out in labelCount / labelIndex in this order:
//   [0]  target head (span startT..endT)
//   [1]  source head (span startS..endS)
//   then one slot per hole, target side, in holeColl.GetHoles() order
//   then one slot per hole, source side, in holeColl.GetSortedSourceHoles() order
// A slot has as many choices as GetNodes() returns for its span, or exactly
// one when syntax is disabled for that side. saveHieroPhrase() is called once
// for every combination of choices.
//
// holeColl is modified: SortSourceHoles() is called on it.
// countS is forwarded to saveHieroPhrase() unchanged.
void ExtractTask::saveAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
{
  LabelIndex labelIndex,labelCount;

  // number of target head labels
  int numLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(startT,endT).size() : 1;
  if (m_options.targetSyntacticPreferences && !numLabels) {
    // with target syntactic preferences an unlabelled span still gets one slot value
    numLabels++;
  }
  labelCount.push_back(numLabels);
  labelIndex.push_back(0);

  // number of source head labels
  numLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(startS,endS).size() : 1;
  labelCount.push_back(numLabels);
  labelIndex.push_back(0);

  // number of target hole labels
  for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
       hole != holeColl.GetHoles().end(); ++hole ) {
    int numHoleLabels = m_options.targetSyntax ? m_sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
    if (m_options.targetSyntacticPreferences && !numHoleLabels) {
      numHoleLabels++;
    }
    labelCount.push_back(numHoleLabels);
    labelIndex.push_back(0);
  }

  // number of source hole labels
  holeColl.SortSourceHoles();
  for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
       i != holeColl.GetSortedSourceHoles().end(); ++i ) {
    const Hole &hole = **i;
    int numHoleLabels =  m_options.sourceSyntax ? m_sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
    labelCount.push_back(numHoleLabels);
    labelIndex.push_back(0);
  }

  // A slot with no candidate label makes the set of combinations empty.
  // Without this guard the counter below could never wrap that slot (its
  // index is incremented before it is compared with the count), so the
  // enumeration would not terminate.
  for(unsigned int i=0; i<labelCount.size(); i++) {
    if (labelCount[i] <= 0)
      return;
  }

  // enumerate all label combinations: labelIndex works like a mixed-radix
  // counter whose lowest digit is slot 0
  bool done = false;
  while(!done) {
    saveHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
    for(unsigned int i=0; i<labelIndex.size(); i++) {
      labelIndex[i]++;
      if(labelIndex[i] == labelCount[i]) {
        // this digit wrapped: reset it and carry into the next slot
        labelIndex[i] = 0;
        if (i == labelIndex.size()-1)
          done = true; // the highest digit wrapped, every combination was visited
      } else {
        break;
      }
    }
  }
}
Пример #2
0
// Prints every labelled variant of one hierarchical phrase pair.
//
// One "label slot" is created for each place where a syntax label can be
// chosen. The slots are laid out in labelCount / labelIndex in this order:
//   [0]  target head (span startT..endT)
//   [1]  source head (span startS..endS)
//   then one slot per hole, target side, in holeColl.GetHoles() order
//   then one slot per hole, source side, in holeColl.GetSortedSourceHoles() order
// A slot has as many choices as GetNodes() returns for its span, or exactly
// one when syntax is disabled for that side. printHieroPhrase() is called
// once for every combination of choices.
//
// holeColl is modified: SortSourceHoles() is called on it.
void printAllHieroPhrases( SentenceAlignmentWithSyntax &sentence
                           , int startT, int endT, int startS, int endS
                           , HoleCollection &holeColl)
{
  LabelIndex labelIndex,labelCount;

  // number of target head labels
  int numLabels = options.targetSyntax ? sentence.targetTree.GetNodes(startT,endT).size() : 1;
  labelCount.push_back(numLabels);
  labelIndex.push_back(0);

  // number of source head labels
  numLabels =  options.sourceSyntax ? sentence.sourceTree.GetNodes(startS,endS).size() : 1;
  labelCount.push_back(numLabels);
  labelIndex.push_back(0);

  // number of target hole labels
  for( HoleList::const_iterator hole = holeColl.GetHoles().begin();
       hole != holeColl.GetHoles().end(); ++hole ) {
    int numHoleLabels =  options.targetSyntax ? sentence.targetTree.GetNodes(hole->GetStart(1),hole->GetEnd(1)).size() : 1 ;
    labelCount.push_back(numHoleLabels);
    labelIndex.push_back(0);
  }

  // number of source hole labels
  holeColl.SortSourceHoles();
  for( vector<Hole*>::iterator i = holeColl.GetSortedSourceHoles().begin();
       i != holeColl.GetSortedSourceHoles().end(); ++i ) {
    const Hole &hole = **i;
    int numHoleLabels =  options.sourceSyntax ? sentence.sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
    labelCount.push_back(numHoleLabels);
    labelIndex.push_back(0);
  }

  // A slot with no candidate label makes the set of combinations empty.
  // Without this guard the counter below could never wrap that slot (its
  // index is incremented before it is compared with the count), so the
  // enumeration would not terminate.
  for(unsigned int i=0; i<labelCount.size(); i++) {
    if (labelCount[i] <= 0)
      return;
  }

  // enumerate all label combinations: labelIndex works like a mixed-radix
  // counter whose lowest digit is slot 0
  // (unsigned index: it is compared against size(), which is unsigned)
  bool done = false;
  while(!done) {
    printHieroPhrase( sentence, startT, endT, startS, endS, holeColl, labelIndex );
    for(unsigned int i=0; i<labelIndex.size(); i++) {
      labelIndex[i]++;
      if(labelIndex[i] == labelCount[i]) {
        // this digit wrapped: reset it and carry into the next slot
        labelIndex[i] = 0;
        if (i == labelIndex.size()-1)
          done = true; // the highest digit wrapped, every combination was visited
      } else {
        break;
      }
    }
  }
}
Пример #3
0
// Recursively pokes holes (non-terminals) into a phrase pair.
//
// Each call tries every admissible new hole: a target span
// startHoleT..endHoleT with startHoleT >= initStartT, paired with each source
// span that ruleExist reports for it. For every hole that passes the checks
// the rule is printed via printAllHieroPhrases() (if it respects the symbol
// limits) and the function calls itself to add a further hole to the right.
//
//   sentence               read for alignedToT and forwarded to printAllHieroPhrases()
//   startT..endT           target span of the phrase pair
//   startS..endS           source span of the phrase pair
//   ruleExist              supplies the candidate source holes for a target span
//   holeColl               holes already in the phrase pair; never modified,
//                          an extended copy is passed down the recursion
//   numHoles               number of holes already in holeColl
//   initStartT             first target position where a new hole may start
//   wordCountT/wordCountS  words still left (not covered by holes) on each side
void addHieroRule( SentenceAlignmentWithSyntax &sentence
                   , int startT, int endT, int startS, int endS
                   , RuleExist &ruleExist, const HoleCollection &holeColl
                   , int numHoles, int initStartT, int wordCountT, int wordCountS)
{
  // done, if already the maximum number of non-terminals in phrase pair
  if (numHoles >= options.maxNonTerm)
    return;

  // find a hole...
  for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT) {
    for (int endHoleT = startHoleT+(options.minHoleTarget-1); endHoleT <= endT; ++endHoleT) {
      // determine the number of remaining target words
      const int newWordCountT = wordCountT - (endHoleT-startHoleT+1);

      // if last non-terminal, enforce word count limit
      // (the hole length is measured from startHoleT, mirroring the source-side
      // check below; the same limit is re-checked before printing, so this
      // test only prunes candidates that could never be printed)
      if (numHoles == options.maxNonTerm-1 && newWordCountT + (numHoles+1) > options.maxSymbolsTarget)
        continue;

      // always enforce min word count limit
      if (newWordCountT < options.minWords)
        continue;

      // except the whole span
      if (startHoleT == startT && endHoleT == endT)
        continue;

      // does a phrase cover this target span?
      // if it does, then there should be a list of mapped source phrases
      // (multiple possible due to unaligned words)
      const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT);

      // loop over sub phrase pairs
      HoleList::const_iterator iterSourceHoles;
      for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles) {
        const Hole &sourceHole = *iterSourceHoles;

        const int sourceHoleSize = sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1;

        // enforce minimum hole size
        if (sourceHoleSize < options.minHoleSource)
          continue;

        // determine the number of remaining source words
        const int newWordCountS = wordCountS - sourceHoleSize;

        // if last non-terminal, enforce word count limit
        if (numHoles == options.maxNonTerm-1 && newWordCountS + (numHoles+1) > options.maxSymbolsSource)
          continue;

        // enforce min word count limit
        if (newWordCountS < options.minWords)
          continue;

        // hole must be subphrase of the source phrase
        // (may be violated if subphrase contains additional unaligned source word)
        if (startS > sourceHole.GetStart(0) || endS <  sourceHole.GetEnd(0))
          continue;

        // make sure source side does not overlap with another hole
        if (holeColl.OverlapSource(sourceHole))
          continue;

        // if consecutive non-terminals are not allowed, also check for source
        if (!options.nonTermConsecSource && holeColl.ConsecSource(sourceHole) )
          continue;

        // check that rule scope would not exceed limit if sourceHole
        // were added
        if (holeColl.Scope(sourceHole) > options.maxScope)
          continue;

        // require that at least one aligned word is left (unless there are no words at all)
        if (options.requireAlignedWord && (newWordCountS > 0 || newWordCountT > 0)) {
          // NOTE(review): this single forward pass advances iterHoleList only
          // when pos hits the start of the current hole, so it appears to
          // assume GetHoles() is ordered by target start -- confirm
          HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
          bool foundAlignedWord = false;
          // loop through all word positions
          for(int pos = startT; pos <= endT && !foundAlignedWord; pos++) {
            // new hole? moving on...
            if (pos == startHoleT) {
              pos = endHoleT; // the loop increment then steps past the hole
            }
            // covered by hole? moving on...
            else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos) {
              pos = iterHoleList->GetEnd(1);
              ++iterHoleList;
            }
            // covered by word? check if it is aligned
            else {
              if (sentence.alignedToT[pos].size() > 0)
                foundAlignedWord = true;
            }
          }
          if (!foundAlignedWord)
            continue;
        }

        // update list of holes in this phrase pair
        HoleCollection copyHoleColl(holeColl);
        copyHoleColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0));

        // now some checks that disallow this phrase pair, but not further recursion
        bool allowablePhrase = true;

        // maximum words count violation?
        if (newWordCountS + (numHoles+1) > options.maxSymbolsSource)
          allowablePhrase = false;

        if (newWordCountT + (numHoles+1) > options.maxSymbolsTarget)
          allowablePhrase = false;

        // passed all checks...
        if (allowablePhrase)
          printAllHieroPhrases(sentence, startT, endT, startS, endS, copyHoleColl);

        // recursively search for next hole
        // (when consecutive target non-terminals are disallowed, leave a
        // one-position gap after this hole)
        int nextInitStartT = options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
        addHieroRule(sentence, startT, endT, startS, endS
                     , ruleExist, copyHoleColl, numHoles + 1, nextInitStartT
                     , newWordCountT, newWordCountS);
      }
    }
  }
}