void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &bestAlignment, ostream &out) { // output target symbols, except root, in rule table format for (std::size_t i = 0; i < phraseT.size()-1; ++i) { const std::string &word = vcbT.getWord(phraseT[i]); if (!unpairedExtractFormatFlag || !isNonTerminal(word)) { out << word << " "; continue; } // get corresponding source non-terminal and output pair std::set<std::size_t> alignmentPoints = bestAlignment.alignedToT[i]; assert(alignmentPoints.size() == 1); int j = *(alignmentPoints.begin()); if (inverseFlag) { out << word << vcbS.getWord(phraseS[j]) << " "; } else { out << vcbS.getWord(phraseS[j]) << word << " "; } } // output target root symbol if (conditionOnTargetLhsFlag) { if (inverseFlag) { out << "[X]"; } else { out << vcbS.getWord(phraseS.back()); } } else { out << vcbT.getWord(phraseT.back()); } }
bool PhraseAlignment::equals( const PhraseAlignment& other ) { if (this == &other) return true; if (other.english != english) return false; if (other.foreign != foreign) return false; PHRASE phraseE = phraseTableE.getPhrase( english ); PHRASE phraseF = phraseTableF.getPhrase( foreign ); for(int i=0; i<phraseE.size(); i++) { if (alignedToE[i].size() != other.alignedToE[i].size()) return false; for(int j=0; j<alignedToE[i].size(); j++) { if (alignedToE[i][j] != other.alignedToE[i][j]) return false; } } for(int i=0; i<phraseF.size(); i++) { if (alignedToF[i].size() != other.alignedToF[i].size()) return false; for(int j=0; j<alignedToF[i].size(); j++) { if (alignedToF[i][j] != other.alignedToF[i][j]) return false; } } return true; }
// check if two word alignments between a phrase pairs "match" // i.e. they do not differ in the alignment of non-termimals bool PhraseAlignment::match( const PhraseAlignment& other ) { if (other.target != target || other.source != source) return false; if (!hierarchicalFlag) return true; PHRASE phraseT = phraseTableT.getPhrase( target ); assert(phraseT.size() == alignedToT.size() + 1); assert(alignedToT.size() == other.alignedToT.size()); // loop over all words (note: 0 = left hand side of rule) for(size_t i=0;i<phraseT.size()-1;++i) if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) { if (alignedToT[i].size() != 1 || other.alignedToT[i].size() != 1 || *(alignedToT[i].begin()) != *(other.alignedToT[i].begin())) return false; } return true; }
void printSourcePhrase(const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &bestAlignment, ostream &out) { // output source symbols, except root, in rule table format for (int i = 0; i < phraseS.size()-1; ++i) { const std::string &word = vcbS.getWord(phraseS[i]); if (!stringToTreeFlag || !isNonTerminal(word)) { out << word << " "; continue; } // get corresponding target non-terminal and output pair std::set<size_t> alignmentPoints = bestAlignment.alignedToS[i]; assert(alignmentPoints.size() == 1); int j = *(alignmentPoints.begin()); if (inverseFlag) { out << vcbT.getWord(phraseT[j]) << word << " "; } else { out << word << vcbT.getWord(phraseT[j]) << " "; } } // output source root symbol out << vcbS.getWord(phraseS.back()); }
// Write one phrase table line to phraseTableFile for every distinct
// `english` value found in phrasePair.
//
// All entries use the foreign phrase of phrasePair[0] for output. Each line
// has the form
//   foreign ||| english ||| <count> <phrasePair.size()> <|foreign|> <|english|>
// or, with inverseFlag set,
//   english ||| foreign ||| <count> <phrasePair.size()> <|foreign|> <|english|>
// where <count> is the number of entries carrying that `english` value.
//
// The first pass additionally tracks runs of consecutive entries with equal
// alignments and stores an index per `english` value in alignmentE; that map
// is not read again inside this function.
void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
{
  if (phrasePair.empty()) return;

  map<int, int> countE;      // `english` value -> number of entries
  map<int, int> alignmentE;  // `english` value -> index recorded for the longest run
  int totalCount = 0;
  int currentCount = 0;
  int maxSameCount = 0;
  int maxSame = -1;
  int old = -1;

  for (size_t i = 0; i < phrasePair.size(); i++) {
    if (i > 0) {
      const bool sameEnglish = (phrasePair[old].english == phrasePair[i].english);

      // a run ends when the english phrase changes, or when it stays the
      // same but the alignment differs from the previous entry
      if (!sameEnglish || !phrasePair[i].equals( phrasePair[old] )) {
        if (currentCount > maxSameCount) {
          maxSameCount = currentCount;
          maxSame = i-1;
        }

        if (sameEnglish) {
          currentCount = 0;
        } else {
          // wrap up old E and get ready for new E
          alignmentE[ phrasePair[old].english ] = maxSame;
          totalCount = 0;
          currentCount = 0;
          maxSameCount = 0;
          maxSame = -1;
        }
      }
    }

    countE[ phrasePair[i].english ]++;
    old = i;
    currentCount++;
    totalCount++;
  }

  // wrap up the last E
  if (currentCount > maxSameCount) {
    maxSameCount = currentCount;
    maxSame = phrasePair.size()-1;
  }
  alignmentE[ phrasePair[old].english ] = maxSame;

  // output table
  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
  size_t index = 0;
  for (map< int, int >::iterator entry = countE.begin(); entry != countE.end(); entry++) {
    // foreign phrase (unless inverse)
    if (!inverseFlag) {
      for (size_t j = 0; j < phraseF.size(); j++) {
        phraseTableFile << vcbF.getWord( phraseF[j] ) << " ";
      }
      phraseTableFile << "||| ";
    }

    // english phrase
    PHRASE phraseE = phraseTableE.getPhrase( entry->first );
    for (size_t j = 0; j < phraseE.size(); j++) {
      phraseTableFile << vcbE.getWord( phraseE[j] ) << " ";
    }
    phraseTableFile << "||| ";

    // foreign phrase (if inverse)
    if (inverseFlag) {
      for (size_t j = 0; j < phraseF.size(); j++) {
        phraseTableFile << vcbF.getWord( phraseF[j] ) << " ";
      }
      phraseTableFile << "||| ";
    }

    // phrase pair frequency, source phrase pair frequency,
    // source phrase length, target phrase length
    phraseTableFile << entry->second
                    << " " << phrasePair.size()
                    << " " << phraseF.size()
                    << " " << phraseE.size()
                    << endl;

    index += entry->second;
  }
}
void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount ) { if (phrasePair.size() == 0) return; PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); // compute count float count = 0; for(size_t i=0;i<phrasePair.size();i++) { count += phrasePair[i]->count; } PHRASE phraseS = phraseTableS.getPhrase( phrasePair[0]->GetSource() ); PHRASE phraseT = phraseTableT.getPhrase( phrasePair[0]->GetTarget() ); // labels (if hierarchical) // source phrase (unless inverse) if (! inverseFlag) { for(int j=0;j<phraseS.size();j++) { phraseTableFile << vcbS.getWord( phraseS[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; } // target phrase for(int j=0;j<phraseT.size();j++) { phraseTableFile << vcbT.getWord( phraseT[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; // source phrase (if inverse) if (inverseFlag) { for(int j=0;j<phraseS.size();j++) { phraseTableFile << vcbS.getWord( phraseS[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; } // phrase translation probability if (goodTuringFlag && count<GT_MAX) count *= discountFactor[(int)(count+0.99999)]; double condScore = count / totalCount; phraseTableFile << ( logProbFlag ? negLogProb*log(condScore) : condScore ); // lexical translation probability if (lexFlag) { double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment); phraseTableFile << " " << ( logProbFlag ? negLogProb*log(lexScore) : lexScore ); } phraseTableFile << " ||| "; // alignment info for non-terminals if (! 
inverseFlag) { if (hierarchicalFlag) { // always output alignment if hiero style, but only for non-terms assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); for(int j = 0; j < phraseT.size() - 1; j++) { if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { assert(bestAlignment->alignedToT[ j ].size() == 1); int sourcePos = *(bestAlignment->alignedToT[ j ].begin()); phraseTableFile << sourcePos << "-" << j << " "; } } } else if (wordAlignmentFlag) { // alignment info in pb model for(int j=0;j<bestAlignment->alignedToT.size();j++) { const set< size_t > &aligned = bestAlignment->alignedToT[j]; for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) { phraseTableFile << *p << "-" << j << " "; } } } } phraseTableFile << " ||| " << totalCount; phraseTableFile << endl; }
void outputPhrasePair(vector<PhraseAlignment*> &phrasePair, float totalCount, Bz2LineWriter& phraseTableFile) { if (phrasePair.size() == 0) return; PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); // compute count float count = 0.; for(size_t i=0; i<phrasePair.size(); count += phrasePair[i++]->count); PHRASE phraseS = phraseTableS.getPhrase( phrasePair[0]->source ); PHRASE phraseT = phraseTableT.getPhrase( phrasePair[0]->target ); // labels (if hierarchical) // source phrase (unless inverse) if (!inverseFlag) { for (size_t j=0; j<phraseS.size(); phraseTableFile.writeLine(vcbS.getWord(phraseS[j++]) + " ")); phraseTableFile.writeLine("||| "); } // target phrase for (size_t j=0; j<phraseT.size(); phraseTableFile.writeLine(vcbT.getWord(phraseT[j++]) + " ")); phraseTableFile.writeLine("||| "); // source phrase (if inverse) if (inverseFlag) { for (size_t j=0; j<phraseS.size(); phraseTableFile.writeLine(vcbS.getWord(phraseS[j++]) + " ")); phraseTableFile.writeLine("||| "); } // alignment info for non-terminals if (!inverseFlag && hierarchicalFlag) { assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); for(size_t j = 0; j < phraseT.size() - 1; ++j) if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { assert(bestAlignment->alignedToT[ j ].size() == 1); stringstream data; data << *(bestAlignment->alignedToT[j].begin()) << "-" << j << " "; phraseTableFile.writeLine(data.str()); } phraseTableFile.writeLine("||| "); } // phrase translation probability if (goodTuringFlag && count<GT_MAX) count *= discountFactor[(int)(count+0.99999)]; { stringstream data; data << (logProbFlag ? negLogProb*log(count / totalCount) : count / totalCount); phraseTableFile.writeLine(data.str()); } // lexical translation probability if (lexFlag) { stringstream data; data << " " << (logProbFlag ? 
negLogProb*log(computeLexicalTranslation(phraseS, phraseT, bestAlignment)) : computeLexicalTranslation(phraseS, phraseT, bestAlignment)); phraseTableFile.writeLine(data.str()); } { stringstream data; data << " ||| " << totalCount << endl; phraseTableFile.writeLine(data.str()); } // optional output of word alignments if (!inverseFlag && wordAlignmentFlag) { // source phrase for(size_t j=0;j<phraseS.size(); wordAlignmentFile << vcbS.getWord(phraseS[j++]) << " "); wordAlignmentFile << "||| "; // target phrase for(size_t j=0;j<phraseT.size(); wordAlignmentFile << vcbT.getWord(phraseT[j++]) << " "); wordAlignmentFile << "|||"; // alignment for(size_t j=0;j<bestAlignment->alignedToT.size(); ++j) { const set< size_t > &aligned = bestAlignment->alignedToT[j]; for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); wordAlignmentFile << " " << *(p++) << "-" << j); } wordAlignmentFile << endl; } }