// Check for equal non-terminal alignment in case of SCFG rules. // Precondition: otherTargetToSourceAlignment has the same size as m_targetToSourceAlignments.begin()->first bool ExtractionPhrasePair::MatchesAlignment( ALIGNMENT *otherTargetToSourceAlignment ) const { if (!hierarchicalFlag) return true; // all or none of the phrasePair's word alignment matrices match, so just pick one const ALIGNMENT *thisTargetToSourceAlignment = m_targetToSourceAlignments.begin()->first; assert(m_phraseTarget->size() == thisTargetToSourceAlignment->size() + 1); assert(thisTargetToSourceAlignment->size() == otherTargetToSourceAlignment->size()); // loop over all symbols but the left hand side of the rule for (size_t i=0; i<thisTargetToSourceAlignment->size()-1; ++i) { if (isNonTerminal( vcbT.getWord( m_phraseTarget->at(i) ) )) { size_t thisAlign = *(thisTargetToSourceAlignment->at(i).begin()); size_t otherAlign = *(otherTargetToSourceAlignment->at(i).begin()); if (thisTargetToSourceAlignment->at(i).size() != 1 || otherTargetToSourceAlignment->at(i).size() != 1 || thisAlign != otherAlign) { return false; } } } return true; }
double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment ) { // unaligned word counter double unaligned = 1.0; // only checking target words - source words are caught when computing inverse for(int ti=0; ti<alignment->alignedToT.size(); ti++) { const set< size_t > & srcIndices = alignment->alignedToT[ ti ]; if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) { unaligned *= 2.718; } } return unaligned; }
// Maps word ids back to their string form.
//
// tok is resized to wid.size(); position p receives vocab.getWord(wid[p])
// unless ShouldIgnore() holds for that id, in which case the slot keeps
// whatever value resize() left there. NT_index is cleared and then filled, in
// ascending order, with every position whose id satisfies IsNT().
inline void MapBackToStr(const vector<WORD_ID>& wid, vector<WORD>& tok, Vocabulary& vocab, vector<size_t>& NT_index)
{
  const size_t numWords = wid.size();

  tok.resize(numWords);
  NT_index.clear();

  for (size_t pos = 0; pos < numWords; ++pos) {
    const bool skip = ShouldIgnore(wid[pos], vocab);
    if (!skip) {
      tok[pos] = vocab.getWord(wid[pos]);
    }

    if (IsNT(wid[pos], vocab)) {
      NT_index.push_back(pos);
    }
  }
}
void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &bestAlignment, ostream &out) { // output target symbols, except root, in rule table format for (int i = 0; i < phraseT.size()-1; ++i) { const std::string &word = vcbT.getWord(phraseT[i]); if (!stringToTreeFlag || !isNonTerminal(word)) { out << word << " "; continue; } // get corresponding source non-terminal and output pair std::set<size_t> alignmentPoints = bestAlignment.alignedToT[i]; assert(alignmentPoints.size() == 1); int j = *(alignmentPoints.begin()); if (inverseFlag) { out << word << vcbS.getWord(phraseS[j]) << " "; } else { out << vcbS.getWord(phraseS[j]) << word << " "; } } // output target root symbol out << vcbT.getWord(phraseT.back()); }
// check if two word alignments between a phrase pairs "match" // i.e. they do not differ in the alignment of non-termimals bool PhraseAlignment::match( const PhraseAlignment& other ) { if (other.target != target || other.source != source) return false; if (!hierarchicalFlag) return true; PHRASE phraseT = phraseTableT.getPhrase( target ); assert(phraseT.size() == alignedToT.size() + 1); assert(alignedToT.size() == other.alignedToT.size()); // loop over all words (note: 0 = left hand side of rule) for(size_t i=0;i<phraseT.size()-1;++i) if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) { if (alignedToT[i].size() != 1 || other.alignedToT[i].size() != 1 || *(alignedToT[i].begin()) != *(other.alignedToT[i].begin())) return false; } return true; }
// Writes one line to phraseTableFile per distinct English phrase in
// phrasePair.
//
// Line layout (the two phrases swap places when inverseFlag is set):
//   foreign words ||| english words ||| <count> <phrasePair.size()> <foreign length> <english length>
// where <count> is the number of entries carrying that English phrase. The
// foreign phrase is taken from phrasePair[0] only. Nothing is written for an
// empty input.
//
// While counting, the function also tracks, per English phrase, the index of
// the last entry of the longest run of consecutive entries for which equals()
// holds. That bookkeeping (alignmentE, totalCount) is not read again inside
// this function.
void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
{
  if (phrasePair.empty()) return;

  map<int, int> countE;
  map<int, int> alignmentE;
  int totalCount = 0;
  int currentCount = 0;
  int maxSameCount = 0;
  int maxSame = -1;
  int old = -1;

  for (size_t cur = 0; cur < phrasePair.size(); ++cur) {
    if (cur > 0) {
      const bool sameEnglish = phrasePair[old].english == phrasePair[cur].english;

      // a run ends when the English phrase changes or the pair differs
      if (!sameEnglish || !phrasePair[cur].equals( phrasePair[old] )) {
        if (currentCount > maxSameCount) {
          maxSameCount = currentCount;
          maxSame = cur-1;
        }
        currentCount = 0;

        if (!sameEnglish) {
          // wrap up old E and get ready for new E
          alignmentE[ phrasePair[old].english ] = maxSame;
          totalCount = 0;
          maxSameCount = 0;
          maxSame = -1;
        }
      }
    }

    ++countE[ phrasePair[cur].english ];
    old = cur;
    ++currentCount;
    ++totalCount;
  }

  // wrap up the final E
  if (currentCount > maxSameCount) {
    maxSameCount = currentCount;
    maxSame = phrasePair.size()-1;
  }
  alignmentE[ phrasePair[old].english ] = maxSame;

  // output table
  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
  size_t index = 0; // running sum of the counts written so far; not read in this function

  for (map< int, int >::iterator entry = countE.begin(); entry != countE.end(); ++entry) {
    // foreign phrase (unless inverse)
    if (!inverseFlag) {
      for (size_t w = 0; w < phraseF.size(); ++w) {
        phraseTableFile << vcbF.getWord( phraseF[w] ) << " ";
      }
      phraseTableFile << "||| ";
    }

    // english phrase
    PHRASE phraseE = phraseTableE.getPhrase( entry->first );
    for (size_t w = 0; w < phraseE.size(); ++w) {
      phraseTableFile << vcbE.getWord( phraseE[w] ) << " ";
    }
    phraseTableFile << "||| ";

    // foreign phrase (if inverse)
    if (inverseFlag) {
      for (size_t w = 0; w < phraseF.size(); ++w) {
        phraseTableFile << vcbF.getWord( phraseF[w] ) << " ";
      }
      phraseTableFile << "||| ";
    }

    // phrase pair frequency, source phrase pair frequency,
    // source phrase length, target phrase length
    phraseTableFile << entry->second
                    << " " << phrasePair.size()
                    << " " << phraseF.size()
                    << " " << phraseE.size()
                    << endl;

    index += entry->second;
  }
}
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile ) { if (phrasePair.size() == 0) return; PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); // compute count float count = 0; for(size_t i=0; i<phrasePair.size(); i++) { count += phrasePair[i]->count; } // collect count of count statistics if (goodTuringFlag || kneserNeyFlag) { totalDistinct++; int countInt = count + 0.99999; if(countInt <= COC_MAX) countOfCounts[ countInt ]++; } // compute PCFG score float pcfgScore; if (pcfgFlag && !inverseFlag) { float pcfgSum = 0; for(size_t i=0; i<phrasePair.size(); ++i) { pcfgSum += phrasePair[i]->pcfgSum; } pcfgScore = pcfgSum / count; } // output phrases const PHRASE &phraseS = phrasePair[0]->GetSource(); const PHRASE &phraseT = phrasePair[0]->GetTarget(); // do not output if hierarchical and count below threshold if (hierarchicalFlag && count < minCountHierarchical) { for(int j=0; j<phraseS.size()-1; j++) { if (isNonTerminal(vcbS.getWord( phraseS[j] ))) return; } } // source phrase (unless inverse) if (! inverseFlag) { printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); phraseTableFile << " ||| "; } // target phrase printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); phraseTableFile << " ||| "; // source phrase (if inverse) if (inverseFlag) { printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); phraseTableFile << " ||| "; } // lexical translation probability if (lexFlag) { double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment); phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore ); } // unaligned word penalty if (unalignedFlag) { double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment); phraseTableFile << " " << ( logProbFlag ? 
negLogProb*log(penalty) : penalty ); } // unaligned function word penalty if (unalignedFWFlag) { double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment); phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty ); } if (pcfgFlag && !inverseFlag) { // target-side PCFG score phraseTableFile << " " << pcfgScore; } phraseTableFile << " ||| "; // alignment info for non-terminals if (! inverseFlag) { if (hierarchicalFlag) { // always output alignment if hiero style, but only for non-terms assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); for(int j = 0; j < phraseT.size() - 1; j++) { if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { if (bestAlignment->alignedToT[ j ].size() != 1) { cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl; phraseTableFile.flush(); assert(bestAlignment->alignedToT[ j ].size() == 1); } int sourcePos = *(bestAlignment->alignedToT[ j ].begin()); phraseTableFile << sourcePos << "-" << j << " "; } } } else if (wordAlignmentFlag) { // alignment info in pb model for(int j=0; j<bestAlignment->alignedToT.size(); j++) { const set< size_t > &aligned = bestAlignment->alignedToT[j]; for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) { phraseTableFile << *p << "-" << j << " "; } } } } // counts phraseTableFile << " ||| " << totalCount << " " << count; if (kneserNeyFlag) phraseTableFile << " " << distinctCount; // nt lengths if (outputNTLengths) { phraseTableFile << " ||| "; if (!inverseFlag) { map<size_t, map<size_t, float> > sourceProb, targetProb; // 1st sourcePos, 2nd = length, 3rd = prob calcNTLengthProb(phrasePair, sourceProb, targetProb); outputNTLengthProbs(phraseTableFile, sourceProb, "S"); outputNTLengthProbs(phraseTableFile, targetProb, "T"); } } phraseTableFile << endl; }
void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount ) { if (phrasePair.size() == 0) return; PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); // compute count float count = 0; for(size_t i=0;i<phrasePair.size();i++) { count += phrasePair[i]->count; } PHRASE phraseS = phraseTableS.getPhrase( phrasePair[0]->GetSource() ); PHRASE phraseT = phraseTableT.getPhrase( phrasePair[0]->GetTarget() ); // labels (if hierarchical) // source phrase (unless inverse) if (! inverseFlag) { for(int j=0;j<phraseS.size();j++) { phraseTableFile << vcbS.getWord( phraseS[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; } // target phrase for(int j=0;j<phraseT.size();j++) { phraseTableFile << vcbT.getWord( phraseT[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; // source phrase (if inverse) if (inverseFlag) { for(int j=0;j<phraseS.size();j++) { phraseTableFile << vcbS.getWord( phraseS[j] ); phraseTableFile << " "; } phraseTableFile << "||| "; } // phrase translation probability if (goodTuringFlag && count<GT_MAX) count *= discountFactor[(int)(count+0.99999)]; double condScore = count / totalCount; phraseTableFile << ( logProbFlag ? negLogProb*log(condScore) : condScore ); // lexical translation probability if (lexFlag) { double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment); phraseTableFile << " " << ( logProbFlag ? negLogProb*log(lexScore) : lexScore ); } phraseTableFile << " ||| "; // alignment info for non-terminals if (! 
inverseFlag) { if (hierarchicalFlag) { // always output alignment if hiero style, but only for non-terms assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); for(int j = 0; j < phraseT.size() - 1; j++) { if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { assert(bestAlignment->alignedToT[ j ].size() == 1); int sourcePos = *(bestAlignment->alignedToT[ j ].begin()); phraseTableFile << sourcePos << "-" << j << " "; } } } else if (wordAlignmentFlag) { // alignment info in pb model for(int j=0;j<bestAlignment->alignedToT.size();j++) { const set< size_t > &aligned = bestAlignment->alignedToT[j]; for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) { phraseTableFile << *p << "-" << j << " "; } } } } phraseTableFile << " ||| " << totalCount; phraseTableFile << endl; }
void outputPhrasePair(vector<PhraseAlignment*> &phrasePair, float totalCount, Bz2LineWriter& phraseTableFile) { if (phrasePair.size() == 0) return; PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); // compute count float count = 0.; for(size_t i=0; i<phrasePair.size(); count += phrasePair[i++]->count); PHRASE phraseS = phraseTableS.getPhrase( phrasePair[0]->source ); PHRASE phraseT = phraseTableT.getPhrase( phrasePair[0]->target ); // labels (if hierarchical) // source phrase (unless inverse) if (!inverseFlag) { for (size_t j=0; j<phraseS.size(); phraseTableFile.writeLine(vcbS.getWord(phraseS[j++]) + " ")); phraseTableFile.writeLine("||| "); } // target phrase for (size_t j=0; j<phraseT.size(); phraseTableFile.writeLine(vcbT.getWord(phraseT[j++]) + " ")); phraseTableFile.writeLine("||| "); // source phrase (if inverse) if (inverseFlag) { for (size_t j=0; j<phraseS.size(); phraseTableFile.writeLine(vcbS.getWord(phraseS[j++]) + " ")); phraseTableFile.writeLine("||| "); } // alignment info for non-terminals if (!inverseFlag && hierarchicalFlag) { assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); for(size_t j = 0; j < phraseT.size() - 1; ++j) if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { assert(bestAlignment->alignedToT[ j ].size() == 1); stringstream data; data << *(bestAlignment->alignedToT[j].begin()) << "-" << j << " "; phraseTableFile.writeLine(data.str()); } phraseTableFile.writeLine("||| "); } // phrase translation probability if (goodTuringFlag && count<GT_MAX) count *= discountFactor[(int)(count+0.99999)]; { stringstream data; data << (logProbFlag ? negLogProb*log(count / totalCount) : count / totalCount); phraseTableFile.writeLine(data.str()); } // lexical translation probability if (lexFlag) { stringstream data; data << " " << (logProbFlag ? 
negLogProb*log(computeLexicalTranslation(phraseS, phraseT, bestAlignment)) : computeLexicalTranslation(phraseS, phraseT, bestAlignment)); phraseTableFile.writeLine(data.str()); } { stringstream data; data << " ||| " << totalCount << endl; phraseTableFile.writeLine(data.str()); } // optional output of word alignments if (!inverseFlag && wordAlignmentFlag) { // source phrase for(size_t j=0;j<phraseS.size(); wordAlignmentFile << vcbS.getWord(phraseS[j++]) << " "); wordAlignmentFile << "||| "; // target phrase for(size_t j=0;j<phraseT.size(); wordAlignmentFile << vcbT.getWord(phraseT[j++]) << " "); wordAlignmentFile << "|||"; // alignment for(size_t j=0;j<bestAlignment->alignedToT.size(); ++j) { const set< size_t > &aligned = bestAlignment->alignedToT[j]; for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); wordAlignmentFile << " " << *(p++) << "-" << j); } wordAlignmentFile << endl; } }