コード例 #1
0
ファイル: score.cpp プロジェクト: xwd/mosesGit-hiero
void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
{
  if (phrasePair.size() == 0) return;

  // group phrase pairs based on alignments that matter
  // (i.e. that re-arrange non-terminals)
  PhrasePairGroup phrasePairGroup;
  
  float totalSource = 0;

  //cerr << "phrasePair.size() = " << phrasePair.size() << endl;
  
  // loop through phrase pairs
  for(size_t i=0; i<phrasePair.size(); i++) {
    // add to total count
    PhraseAlignment &currPhrasePair = phrasePair[i];
    
    totalSource += phrasePair[i].count;
    
    // check for matches
    //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
    
    PhraseAlignmentCollection phraseAlignColl;
    phraseAlignColl.push_back(&currPhrasePair);
    pair<PhrasePairGroup::iterator, bool> retInsert;
    retInsert = phrasePairGroup.insert(phraseAlignColl);
    if (!retInsert.second)
    { // already exist. Add to that collection instead
      PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
      existingColl.push_back(&currPhrasePair);
    }
    
  }

  // output the distinct phrase pairs, one at a time
  const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
  PhrasePairGroup::SortedColl::const_iterator iter;

  for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) 
  {
    const PhraseAlignmentCollection &group = **iter;
    outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );

  }
  
}
コード例 #2
0
ファイル: score.cpp プロジェクト: xwd/mosesGit-hiero
/**
 * Write one phrase-table line for a group of phrase-pair occurrences.
 *
 * Fields are separated by " ||| " and appear in this order:
 *   source (or target first when inverseFlag is set), the other side,
 *   scores, alignment points, counts, and optionally non-terminal lengths.
 *
 * @param phrasePair       group of occurrences (pointers); source and target
 *                         phrases are taken from the first element
 * @param totalCount       written as the first number of the counts field
 * @param distinctCount    written in the counts field only when kneserNeyFlag
 *                         is set
 * @param phraseTableFile  output stream; the line is terminated with endl
 *
 * Nothing is written for an empty group, or for a hierarchical entry whose
 * count is below minCountHierarchical and whose source contains a
 * non-terminal. Count-of-counts statistics are updated before that filter,
 * so filtered entries are still included in them.
 */
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
{
  if (phrasePair.size() == 0) return;

  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );

  // compute count: sum over all occurrences in the group
  float count = 0;
  for(size_t i=0; i<phrasePair.size(); i++) {
    count += phrasePair[i]->count;
  }

  // collect count of count statistics
  if (goodTuringFlag || kneserNeyFlag) {
    totalDistinct++;
    // round fractional counts up to the next integer bucket
    int countInt = count + 0.99999;
    if(countInt <= COC_MAX)
      countOfCounts[ countInt ]++;
  }

  // compute PCFG score (average of the per-occurrence sums, weighted by count)
  // initialised so the variable never holds an indeterminate value
  float pcfgScore = 0;
  if (pcfgFlag && !inverseFlag) {
    float pcfgSum = 0;
    for(size_t i=0; i<phrasePair.size(); ++i) {
        pcfgSum += phrasePair[i]->pcfgSum;
    }
    pcfgScore = pcfgSum / count;
  }

  // output phrases
  const PHRASE &phraseS = phrasePair[0]->GetSource();
  const PHRASE &phraseT = phrasePair[0]->GetTarget();

  // do not output if hierarchical and count below threshold
  // (the last symbol of the source phrase is not inspected)
  if (hierarchicalFlag && count < minCountHierarchical) {
    for(size_t j=0; j<phraseS.size()-1; j++) {
      if (isNonTerminal(vcbS.getWord( phraseS[j] )))
        return;
    }
  }

  // source phrase (unless inverse)
  if (! inverseFlag) {
    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // target phrase
  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
  phraseTableFile << " ||| ";

  // source phrase (if inverse)
  if (inverseFlag) {
    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // lexical translation probability
  if (lexFlag) {
    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
    phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
  }

  // unaligned word penalty
  if (unalignedFlag) {
    double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
  }

  // unaligned function word penalty
  if (unalignedFWFlag) {
    double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
  }

  if (pcfgFlag && !inverseFlag) {
    // target-side PCFG score
    phraseTableFile << " " << pcfgScore;
  }

  phraseTableFile << " ||| ";

  // alignment info for non-terminals
  if (! inverseFlag) {
    if (hierarchicalFlag) {
      // always output alignment if hiero style, but only for non-terms
      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
      for(size_t j = 0; j < phraseT.size() - 1; j++) {
        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
          if (bestAlignment->alignedToT[ j ].size() != 1) {
            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
            // flush what has been written so far before the assertion fires
            phraseTableFile.flush();
            // NOTE: with NDEBUG this assert is compiled out and execution continues
            assert(bestAlignment->alignedToT[ j ].size() == 1);
          }
          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
          phraseTableFile << sourcePos << "-" << j << " ";
        }
      }
    } else if (wordAlignmentFlag) {
      // alignment info in pb model: every aligned source position per target position
      for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
        const set< size_t > &aligned = bestAlignment->alignedToT[j];
        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
          phraseTableFile << *p << "-" << j << " ";
        }
      }
    }
  }

  // counts

  phraseTableFile << " ||| " << totalCount << " " << count;
  if (kneserNeyFlag)
    phraseTableFile << " " << distinctCount;

  // nt lengths
  if (outputNTLengths)
  {
    // the field separator is written even when inverseFlag leaves the field empty
    phraseTableFile << " ||| ";

    if (!inverseFlag)
    {
      map<size_t, map<size_t, float> > sourceProb, targetProb;
      // 1st sourcePos, 2nd = length, 3rd = prob

      calcNTLengthProb(phrasePair, sourceProb, targetProb);

      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
      outputNTLengthProbs(phraseTableFile, targetProb, "T");
    }
  }

  phraseTableFile << endl;
}
コード例 #3
0
ファイル: score.cpp プロジェクト: Avmb/mosesdecoder
/**
 * Write one phrase-table line for a group of phrase-pair occurrences.
 *
 * Fields are separated by " ||| " and appear in this order:
 *   source (or target first when inverseFlag is set), the other side,
 *   scores (including optional singleton, crossed-non-terminal, PCFG and
 *   domain features), alignment points, counts, and optionally non-terminal
 *   lengths.
 *
 * @param phrasePair       group of occurrences (pointers); source and target
 *                         phrases are taken from the first element
 * @param totalCount       written as the first number of the counts field
 * @param distinctCount    written in the counts field only when kneserNeyFlag
 *                         is set
 * @param phraseTableFile  output stream; the line is terminated with endl
 * @param isSingleton      written as 1/0 when singletonFeature is set
 *
 * Nothing is written for an empty group, or for a hierarchical entry whose
 * count is below minCountHierarchical and whose source contains a
 * non-terminal. Count-of-counts statistics are updated before that filter,
 * so filtered entries are still included in them.
 */
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
{
  if (phrasePair.size() == 0) return;

  const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );

  // compute count: sum over all occurrences in the group
  float count = 0;
  for(size_t i=0; i<phrasePair.size(); i++) {
    count += phrasePair[i]->count;
  }

  // compute domain counts
  map< string, float > domainCount;
  if (domainFlag) {
    for(size_t i=0; i<phrasePair.size(); i++) {
      string d = domain->getDomainOfSentence( phrasePair[i]->sentenceId );
      // operator[] value-initialises a missing entry to 0, so one lookup suffices
      domainCount[ d ] += phrasePair[i]->count;
    }
  }

  // collect count of count statistics
  if (goodTuringFlag || kneserNeyFlag) {
    totalDistinct++;
    // round fractional counts up to the next integer bucket
    int countInt = count + 0.99999;
    if(countInt <= COC_MAX)
      countOfCounts[ countInt ]++;
  }

  // compute PCFG score (average of the per-occurrence sums, weighted by count)
  float pcfgScore = 0;
  if (pcfgFlag && !inverseFlag) {
    float pcfgSum = 0;
    for(size_t i=0; i<phrasePair.size(); ++i) {
        pcfgSum += phrasePair[i]->pcfgSum;
    }
    pcfgScore = pcfgSum / count;
  }

  // output phrases
  const PHRASE &phraseS = phrasePair[0]->GetSource();
  const PHRASE &phraseT = phrasePair[0]->GetTarget();

  // do not output if hierarchical and count below threshold
  // (the last symbol of the source phrase is not inspected)
  if (hierarchicalFlag && count < minCountHierarchical) {
    for(size_t j=0; j<phraseS.size()-1; j++) {
      if (isNonTerminal(vcbS.getWord( phraseS[j] )))
        return;
    }
  }

  // source phrase (unless inverse)
  if (! inverseFlag) {
    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // target phrase
  printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
  phraseTableFile << " ||| ";

  // source phrase (if inverse)
  if (inverseFlag) {
    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // lexical translation probability
  if (lexFlag) {
    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
    phraseTableFile << maybeLogProb( lexScore );
  }

  // unaligned word penalty
  if (unalignedFlag) {
    double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << maybeLogProb( penalty );
  }

  // unaligned function word penalty
  if (unalignedFWFlag) {
    double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << maybeLogProb( penalty );
  }

  if (singletonFeature) {
    phraseTableFile << " " << (isSingleton ? 1 : 0);
  }

  if (crossedNonTerm && !inverseFlag) {
    phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
  }

  // target-side PCFG score
  if (pcfgFlag && !inverseFlag) {
    phraseTableFile << " " << maybeLogProb( pcfgScore );
  }

  // domain count features
  if (domainFlag) {
    if (domainSparseFlag) {
      // sparse, subset: one feature named after all domains the pair occurs in
      if (domainSubsetFlag) {
        typedef vector< string >::const_iterator I;
        phraseTableFile << " doms";
        for (I i = domain->list.begin(); i != domain->list.end(); ++i ) {
          if (domainCount.find( *i ) != domainCount.end() ) {
            phraseTableFile << "_" << *i;
          }
        }
        phraseTableFile << " 1";
      }
      // sparse, indicator or ratio: one feature per domain the pair occurs in
      else {
        typedef map< string, float >::const_iterator I;
        for (I i=domainCount.begin(); i != domainCount.end(); ++i) {
          if (domainRatioFlag) {
            phraseTableFile << " domr_" << i->first << " " << (i->second / count);
          }
          else {
            phraseTableFile << " dom_" << i->first << " 1";
          }
        }
      }
    }
    // core, subset: one score per non-empty subset of domains
    else if (domainSubsetFlag) {
      if (domain->list.size() > 6) {
        cerr << "ERROR: too many domains for core domain subset features\n";
        exit(1);
      }
      // bit k is set when the pair occurs in domain k; shifts are done in
      // size_t so the comparisons below do not mix signed and unsigned
      size_t bitmap = 0;
      for(size_t bit = 0; bit < domain->list.size(); bit++) {
        if (domainCount.find( domain->list[ bit ] ) != domainCount.end()) {
          bitmap |= static_cast<size_t>(1) << bit;
        }
      }
      const size_t subsetCount = static_cast<size_t>(1) << domain->list.size();
      for(size_t i = 1; i < subsetCount; i++) {
        phraseTableFile << " " << maybeLogProb( (bitmap == i) ? 2.718 : 1 );
      }
    }
    // core, indicator or ratio: one score per known domain
    else {
      typedef vector< string >::const_iterator I;
      for (I i = domain->list.begin(); i != domain->list.end(); ++i ) {
        const map< string, float >::const_iterator found = domainCount.find( *i );
        if (found == domainCount.end() ) {
          phraseTableFile << " " << maybeLogProb( 1 );
        }
        else if (domainRatioFlag) {
          phraseTableFile << " " << maybeLogProb( exp( found->second / count ) );
        }
        else {
          phraseTableFile << " " << maybeLogProb( 2.718 );
        }
      }
    }
  }

  phraseTableFile << " ||| ";

  // alignment info for non-terminals
  if (! inverseFlag) {
    if (hierarchicalFlag) {
      // always output alignment if hiero style, but only for non-terms
      // (eh: output all alignments, needed for some feature functions)
      assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);

      // Alignment points are collected as numeric (source, target) pairs.
      // Sorting the formatted "src-tgt" strings would order them
      // lexicographically and place e.g. "10-0" before "2-1".
      std::vector< std::pair<size_t, size_t> > alignment;
      for(size_t j = 0; j < phraseT.size() - 1; j++) {
        const set< size_t > &aligned = bestAlignment.alignedToT[j];
        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
          if (aligned.size() != 1) {
            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
            // flush what has been written so far before the assertion fires
            phraseTableFile.flush();
            // NOTE: with NDEBUG this assert is compiled out and execution continues
            assert(bestAlignment.alignedToT[ j ].size() == 1);
          }
          // a non-terminal contributes exactly one point
          alignment.push_back(std::pair<size_t, size_t>(*aligned.begin(), j));
        } else {
          // a terminal contributes every source position it is aligned to
          for (set< size_t >::const_iterator p = aligned.begin(); p != aligned.end(); ++p) {
            alignment.push_back(std::pair<size_t, size_t>(*p, j));
          }
        }
      }
      // now print all alignments, sorted by source index (then target index)
      sort(alignment.begin(), alignment.end());
      for (size_t i = 0; i < alignment.size(); ++i) {
        phraseTableFile << alignment[i].first << "-" << alignment[i].second << " ";
      }
    } else if (wordAlignmentFlag) {
      // alignment info in pb model: every aligned source position per target position
      for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
        const set< size_t > &aligned = bestAlignment.alignedToT[j];
        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
          phraseTableFile << *p << "-" << j << " ";
        }
      }
    }
  }


  // counts

  phraseTableFile << " ||| " << totalCount << " " << count;
  if (kneserNeyFlag)
    phraseTableFile << " " << distinctCount;

  // nt lengths
  if (outputNTLengths)
  {
    // the field separator is written even when inverseFlag leaves the field empty
    phraseTableFile << " ||| ";

    if (!inverseFlag)
    {
      map<size_t, map<size_t, float> > sourceProb, targetProb;
      // 1st sourcePos, 2nd = length, 3rd = prob

      calcNTLengthProb(phrasePair, sourceProb, targetProb);

      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
      outputNTLengthProbs(phraseTableFile, targetProb, "T");
    }
  }

  phraseTableFile << endl;
}