// Check for equal non-terminal alignment in case of SCFG rules.
//
// Compares otherTargetToSourceAlignment with one of the target-to-source
// alignments stored for this phrase pair. Returns true when, for every target
// non-terminal, both alignments contain exactly one source position and that
// position is the same. Always returns true when hierarchicalFlag is not set.
//
// Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first
bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const
{
  if (!hierarchicalFlag) return true;

  // all or none of the phrasePair's word alignment matrices match, so just pick one
  const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first;

  assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1);
  assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size());

  // The alignment already has one entry less than the target phrase (first
  // assertion above), i.e. it does not contain the left hand side of the rule.
  // Every entry therefore has to be visited; stopping at size()-1 would skip
  // the last symbol and wrap around for an empty alignment.
  const size_t numSymbols = thisTargetToSourceAlignment->size();
  for (size_t i=0; i<numSymbols; ++i) {
    if (!isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) {
      continue;
    }

    // Check the sizes before dereferencing begin(): a non-terminal without
    // alignment points has an empty set, and *begin() on it is undefined.
    if (thisTargetToSourceAlignment->at(i).size() != 1 ||
        otherTargetToSourceAlignment->at(i).size() != 1) {
      return false;
    }

    const size_t thisAlign  = *(thisTargetToSourceAlignment->at(i).begin());
    const size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin());
    if (thisAlign != otherAlign) {
      return false;
    }
  }

  return true;
}
// Example #2
double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
{
  // unaligned word counter
  double unaligned = 1.0;
  // only checking target words - source words are caught when computing inverse
  for(int ti=0; ti<alignment->alignedToT.size(); ti++) {
    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
    if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
      unaligned *= 2.718;
    }
  }
  return unaligned;
}
	// Maps word ids back to their string form.
	//   wid      - input word ids
	//   tok      - resized to wid.size(); slot i receives vocab.getWord(wid[i])
	//              unless ShouldIgnore(wid[i], vocab) holds, in which case the
	//              slot is left as resize() produced it
	//   vocab    - vocabulary used for the lookups
	//   NT_index - cleared, then filled with every index i for which
	//              IsNT(wid[i], vocab) holds
	inline void MapBackToStr(const vector<WORD_ID>& wid, vector<WORD>& tok, Vocabulary& vocab, vector<size_t>& NT_index){
		NT_index.clear();
		tok.resize(wid.size());

		for(size_t pos = 0; pos < wid.size(); ++pos){
			const bool keepWord = !ShouldIgnore(wid[pos],vocab);
			if(keepWord){
				tok[pos] = vocab.getWord(wid[pos]);
			}

			const bool nonTerminal = IsNT(wid[pos],vocab);
			if(nonTerminal){
				NT_index.push_back(pos);
			}
		}
	}
// Example #4
void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT,
                       const PhraseAlignment &bestAlignment, ostream &out)
{
  // output target symbols, except root, in rule table format
  for (int i = 0; i < phraseT.size()-1; ++i) {
    const std::string &word = vcbT.getWord(phraseT[i]);
    if (!stringToTreeFlag || !isNonTerminal(word)) {
      out << word << " ";
      continue;
    }
    // get corresponding source non-terminal and output pair
    std::set<size_t> alignmentPoints = bestAlignment.alignedToT[i];
    assert(alignmentPoints.size() == 1);
    int j = *(alignmentPoints.begin());
    if (inverseFlag) {
      out << word << vcbS.getWord(phraseS[j]) << " ";
    } else {
      out << vcbS.getWord(phraseS[j]) << word << " ";
    }
  }

  // output target root symbol
  out << vcbT.getWord(phraseT.back());
}
// check if two word alignments between a phrase pairs "match"
// i.e. they do not differ in the alignment of non-termimals
bool PhraseAlignment::match( const PhraseAlignment& other )
{
	if (other.target != target || other.source != source) return false;
	if (!hierarchicalFlag) return true;

	PHRASE phraseT = phraseTableT.getPhrase( target );

  assert(phraseT.size() == alignedToT.size() + 1);
  assert(alignedToT.size() == other.alignedToT.size());

	// loop over all words (note: 0 = left hand side of rule)
	for(size_t i=0;i<phraseT.size()-1;++i)
		if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
			if (alignedToT[i].size() != 1 ||
			    other.alignedToT[i].size() != 1 ||
		    	    *(alignedToT[i].begin()) != *(other.alignedToT[i].begin()))
				return false;
		}

	return true;
}
// Writes one line to phraseTableFile for every distinct English phrase id
// that occurs in phrasePair.
//
// Line layout (each word is followed by a space):
//   [foreign words "||| "]  english words "||| "  [foreign words "||| "]
//   <count of this english id> <phrasePair.size()> <foreign length> <english length>
// The foreign phrase comes first unless inverseFlag is set, in which case it
// follows the English phrase.
//
// Returns without output when phrasePair is empty.
//
// NOTE(review): the foreign phrase is taken from phrasePair[0] only, so all
// entries presumably share the same foreign phrase - confirm with the caller.
// NOTE(review): a change of English phrase is detected by comparing adjacent
// entries only, so the input is presumably grouped by english - confirm.
void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
{
  if (phrasePair.size() == 0) return;
  // english id -> number of entries with that id
  map<int, int> countE;
  // english id -> value of maxSame when that id was wrapped up;
  // written below but never read in this function
  map<int, int> alignmentE;
  // entries seen for the current english id; only used by the commented-out
  // debug output
  int totalCount = 0;
  // length of the current run of adjacent entries for which equals() held
  int currentCount = 0;
  // length of the longest run seen so far for the current english id
  int maxSameCount = 0;
  // index of the last entry of that longest run (-1: none recorded yet)
  int maxSame = -1;
  // index of the previously visited entry
  int old = -1;
  for(size_t i=0; i<phrasePair.size(); i++) {
    if (i>0) {
      if (phrasePair[old].english == phrasePair[i].english) {
        // same english id: a run ends when this entry does not equal the previous one
        if (! phrasePair[i].equals( phrasePair[old] )) {
          if (currentCount > maxSameCount) {
            maxSameCount = currentCount;
            maxSame = i-1;
          }
          currentCount = 0;
        }
      } else {
        // wrap up old E
        if (currentCount > maxSameCount) {
          maxSameCount = currentCount;
          maxSame = i-1;
        }

        alignmentE[ phrasePair[old].english ] = maxSame;
        //	if (maxSameCount != totalCount)
        //  cout << "max count is " << maxSameCount << "/" << totalCount << endl;

        // get ready for new E
        totalCount = 0;
        currentCount = 0;
        maxSameCount = 0;
        maxSame = -1;
      }
    }
    // the current entry always counts towards its english id and the current run
    countE[ phrasePair[i].english ]++;
    old = i;
    currentCount++;
    totalCount++;
  }

  // wrap up old E
  if (currentCount > maxSameCount) {
    maxSameCount = currentCount;
    maxSame = phrasePair.size()-1;
  }
  alignmentE[ phrasePair[old].english ] = maxSame;
  //  if (maxSameCount != totalCount)
  //    cout << "max count is " << maxSameCount << "/" << totalCount << endl;

  // output table
  typedef map< int, int >::iterator II;
  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
  // running sum of the written counts; never read in this function
  size_t index = 0;
  // countE is a map, so the lines come out in ascending order of english id
  for(II i = countE.begin(); i != countE.end(); i++) {
    //cout << "\tp( " << i->first << " | " << phrasePair[0].foreign << " ; " << phraseF.size() << " ) = ...\n";
    //cerr << index << endl;

    // foreign phrase (unless inverse)
    if (! inverseFlag) {
      for(size_t j=0; j<phraseF.size(); j++) {
        phraseTableFile << vcbF.getWord( phraseF[j] );
        phraseTableFile << " ";
      }
      phraseTableFile << "||| ";
    }

    // english phrase
    PHRASE phraseE = phraseTableE.getPhrase( i->first );
    for(size_t j=0; j<phraseE.size(); j++) {
      phraseTableFile << vcbE.getWord( phraseE[j] );
      phraseTableFile << " ";
    }
    phraseTableFile << "||| ";

    // foreign phrase (if inverse)
    if (inverseFlag) {
      for(size_t j=0; j<phraseF.size(); j++) {
        phraseTableFile << vcbF.getWord( phraseF[j] );
        phraseTableFile << " ";
      }
      phraseTableFile << "||| ";
    }

    // phrase pair frequency
    phraseTableFile << i->second;

    //source phrase pair frequency
    phraseTableFile << " " << phrasePair.size();

    // source phrase length
    phraseTableFile	<< " " << phraseF.size();

    // target phrase length
    phraseTableFile	<< " " << phraseE.size();

    phraseTableFile << endl;

    index += i->second;
  }
}
// Example #7
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
{
  if (phrasePair.size() == 0) return;

  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
    
  // compute count
  float count = 0;
  for(size_t i=0; i<phrasePair.size(); i++) {
    count += phrasePair[i]->count;
  }

  // collect count of count statistics
  if (goodTuringFlag || kneserNeyFlag) {
    totalDistinct++;
    int countInt = count + 0.99999;
    if(countInt <= COC_MAX)
      countOfCounts[ countInt ]++;
  }

  // compute PCFG score
  float pcfgScore;
  if (pcfgFlag && !inverseFlag) {
    float pcfgSum = 0;
    for(size_t i=0; i<phrasePair.size(); ++i) {
        pcfgSum += phrasePair[i]->pcfgSum;
    }
    pcfgScore = pcfgSum / count;
  }

  // output phrases
  const PHRASE &phraseS = phrasePair[0]->GetSource();
  const PHRASE &phraseT = phrasePair[0]->GetTarget();

  // do not output if hierarchical and count below threshold
  if (hierarchicalFlag && count < minCountHierarchical) {
    for(int j=0; j<phraseS.size()-1; j++) {
      if (isNonTerminal(vcbS.getWord( phraseS[j] )))
        return;
    }
  }

  // source phrase (unless inverse)
  if (! inverseFlag) {
    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // target phrase
  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
  phraseTableFile << " ||| ";

  // source phrase (if inverse)
  if (inverseFlag) {
    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // lexical translation probability
  if (lexFlag) {
    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
    phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
  }

  // unaligned word penalty
  if (unalignedFlag) {
    double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
  }

  // unaligned function word penalty
  if (unalignedFWFlag) {
    double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
  }

  if (pcfgFlag && !inverseFlag) {
    // target-side PCFG score
    phraseTableFile << " " << pcfgScore;
  }

  phraseTableFile << " ||| ";

  // alignment info for non-terminals
  if (! inverseFlag) {
    if (hierarchicalFlag) {
      // always output alignment if hiero style, but only for non-terms
      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
      for(int j = 0; j < phraseT.size() - 1; j++) {
        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
          if (bestAlignment->alignedToT[ j ].size() != 1) {
            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
            phraseTableFile.flush();
            assert(bestAlignment->alignedToT[ j ].size() == 1);
          }
          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
          phraseTableFile << sourcePos << "-" << j << " ";
        }
      }
    } else if (wordAlignmentFlag) {
      // alignment info in pb model
      for(int j=0; j<bestAlignment->alignedToT.size(); j++) {
        const set< size_t > &aligned = bestAlignment->alignedToT[j];
        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
          phraseTableFile << *p << "-" << j << " ";
        }
      }
    }
  }

  // counts
  
  phraseTableFile << " ||| " << totalCount << " " << count;
  if (kneserNeyFlag) 
    phraseTableFile << " " << distinctCount;
  
  // nt lengths  
  if (outputNTLengths)
  {
    phraseTableFile << " ||| ";

    if (!inverseFlag)
    {
      map<size_t, map<size_t, float> > sourceProb, targetProb;
      // 1st sourcePos, 2nd = length, 3rd = prob

      calcNTLengthProb(phrasePair, sourceProb, targetProb);
      
      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
      outputNTLengthProbs(phraseTableFile, targetProb, "T");
    }    
  }
  
  phraseTableFile << endl;
}
// Example #8
// Writes one phrase table entry for a group of alignments of the same phrase
// pair to the phraseTableFile stream.
//
//   phrasePair - alignments of one source/target pair; nothing is written
//                when it is empty
//   totalCount - denominator of the phrase translation probability; also
//                written as the last field
//
// Fields: source and target phrase (target first when inverseFlag is set),
// scores (phrase translation probability, plus the lexical score when lexFlag
// is set), alignment info (not written when inverseFlag is set), totalCount.
void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount ) 
{
  if (phrasePair.empty()) return;

  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );

  // sum the counts of all alignments
  float count = 0;
  for (size_t n = 0; n < phrasePair.size(); ++n) {
    count += phrasePair[n]->count;
  }

  PHRASE phraseS = phraseTableS.getPhrase( phrasePair[0]->GetSource() );
  PHRASE phraseT = phraseTableT.getPhrase( phrasePair[0]->GetTarget() );

  // labels (if hierarchical)

  // source phrase comes first unless inverse
  if (!inverseFlag) {
    for (size_t s = 0; s < phraseS.size(); ++s) {
      phraseTableFile << vcbS.getWord( phraseS[s] ) << " ";
    }
    phraseTableFile << "||| ";
  }

  // target phrase
  for (size_t t = 0; t < phraseT.size(); ++t) {
    phraseTableFile << vcbT.getWord( phraseT[t] ) << " ";
  }
  phraseTableFile << "||| ";

  // source phrase comes second if inverse
  if (inverseFlag) {
    for (size_t s = 0; s < phraseS.size(); ++s) {
      phraseTableFile << vcbS.getWord( phraseS[s] ) << " ";
    }
    phraseTableFile << "||| ";
  }

  // phrase translation probability, discounted for small counts if requested
  if (goodTuringFlag && count<GT_MAX) {
    count *= discountFactor[static_cast<int>(count+0.99999)];
  }
  double condScore = count / totalCount;
  phraseTableFile << ( logProbFlag ? negLogProb*log(condScore) : condScore );

  // lexical translation probability
  if (lexFlag) {
    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
  }

  phraseTableFile << " ||| ";

  // alignment info
  if (!inverseFlag) {
    if (hierarchicalFlag) {
      // hiero style: always output the alignment, but only for non-terminals
      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
      for (size_t t = 0; t < phraseT.size() - 1; ++t) {
        if (!isNonTerminal(vcbT.getWord( phraseT[t] ))) {
          continue;
        }
        assert(bestAlignment->alignedToT[ t ].size() == 1);
        int sourcePos = *(bestAlignment->alignedToT[ t ].begin());
        phraseTableFile << sourcePos << "-" << t << " ";
      }
    } else if (wordAlignmentFlag) {
      // phrase-based model: output every alignment point
      for (size_t t = 0; t < bestAlignment->alignedToT.size(); ++t) {
        const set< size_t > &aligned = bestAlignment->alignedToT[t];
        set< size_t >::const_iterator point = aligned.begin();
        while (point != aligned.end()) {
          phraseTableFile << *point << "-" << t << " ";
          ++point;
        }
      }
    }
  }

  phraseTableFile << " ||| " << totalCount << endl;
}
// Writes one phrase table entry for a group of alignments of the same phrase
// pair through a Bz2LineWriter and, if requested, the word alignment of the
// best alignment to wordAlignmentFile.
//
//   phrasePair      - alignments of one source/target pair; nothing is
//                     written when it is empty
//   totalCount      - denominator of the phrase translation probability; also
//                     written as the last field
//   phraseTableFile - writer that receives the entry piece by piece
//
// Fields: source and target phrase (target first when inverseFlag is set),
// non-terminal alignment (only when hierarchicalFlag is set and inverseFlag is
// not), scores, totalCount.
void outputPhrasePair(vector<PhraseAlignment*> &phrasePair, float totalCount, Bz2LineWriter& phraseTableFile) {
  if (phrasePair.empty()) {
    return;
  }

  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );

  // sum the counts of all alignments
  float count = 0.;
  for (size_t i = 0; i < phrasePair.size(); ++i) {
    count += phrasePair[i]->count;
  }

  PHRASE phraseS = phraseTableS.getPhrase( phrasePair[0]->source );
  PHRASE phraseT = phraseTableT.getPhrase( phrasePair[0]->target );

  // labels (if hierarchical)

  // source phrase comes first unless inverse
  if (!inverseFlag) {
    for (size_t s = 0; s < phraseS.size(); ++s) {
      phraseTableFile.writeLine(vcbS.getWord(phraseS[s]) + " ");
    }
    phraseTableFile.writeLine("||| ");
  }

  // target phrase
  for (size_t t = 0; t < phraseT.size(); ++t) {
    phraseTableFile.writeLine(vcbT.getWord(phraseT[t]) + " ");
  }
  phraseTableFile.writeLine("||| ");

  // source phrase comes second if inverse
  if (inverseFlag) {
    for (size_t s = 0; s < phraseS.size(); ++s) {
      phraseTableFile.writeLine(vcbS.getWord(phraseS[s]) + " ");
    }
    phraseTableFile.writeLine("||| ");
  }

  // alignment info for non-terminals
  if (!inverseFlag && hierarchicalFlag) {
    assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
    for (size_t t = 0; t < phraseT.size() - 1; ++t) {
      if (!isNonTerminal(vcbT.getWord( phraseT[t] ))) {
        continue;
      }
      assert(bestAlignment->alignedToT[ t ].size() == 1);
      stringstream data;
      data << *(bestAlignment->alignedToT[t].begin()) << "-" << t << " ";
      phraseTableFile.writeLine(data.str());
    }
    phraseTableFile.writeLine("||| ");
  }

  // phrase translation probability, discounted for small counts if requested
  if (goodTuringFlag && count<GT_MAX) {
    count *= discountFactor[static_cast<int>(count+0.99999)];
  }

  {
    stringstream data;
    data << (logProbFlag ? negLogProb*log(count / totalCount) : count / totalCount);
    phraseTableFile.writeLine(data.str());
  }

  // lexical translation probability
  if (lexFlag) {
    stringstream data;
    data << " " << (logProbFlag ?
                    negLogProb*log(computeLexicalTranslation(phraseS, phraseT, bestAlignment)) :
                    computeLexicalTranslation(phraseS, phraseT, bestAlignment));
    phraseTableFile.writeLine(data.str());
  }

  {
    stringstream data;
    data << " ||| " << totalCount << endl;
    phraseTableFile.writeLine(data.str());
  }

  // optional output of word alignments
  if (!inverseFlag && wordAlignmentFlag) {
    // source phrase
    for (size_t s = 0; s < phraseS.size(); ++s) {
      wordAlignmentFile << vcbS.getWord(phraseS[s]) << " ";
    }
    wordAlignmentFile << "||| ";

    // target phrase
    for (size_t t = 0; t < phraseT.size(); ++t) {
      wordAlignmentFile << vcbT.getWord(phraseT[t]) << " ";
    }
    wordAlignmentFile << "|||";

    // alignment points as " source-target"
    for (size_t t = 0; t < bestAlignment->alignedToT.size(); ++t) {
      const set< size_t > &aligned = bestAlignment->alignedToT[t];
      for (set< size_t >::const_iterator p = aligned.begin(); p != aligned.end(); ++p) {
        wordAlignmentFile << " " << *p << "-" << t;
      }
    }
    wordAlignmentFile << endl;
  }
}