// Groups all phrase alignments collected for one source phrase by the
// alignments that matter (i.e. those that re-arrange non-terminals) and
// emits one phrase-table line per distinct group via outputPhrasePair().
// @param phrasePair      every PhraseAlignment extracted for this source phrase
// @param phraseTableFile stream the phrase-table entries are written to
void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile ) {
  if (phrasePair.size() == 0) return;

  // group phrase pairs based on alignments that matter
  // (i.e. that re-arrange non-terminals)
  PhrasePairGroup phrasePairGroup;
  // running total of counts across ALL pairs, passed to outputPhrasePair
  float totalSource = 0;
  //cerr << "phrasePair.size() = " << phrasePair.size() << endl;

  // loop through phrase pairs
  for(size_t i=0; i<phrasePair.size(); i++) {
    // add to total count
    PhraseAlignment &currPhrasePair = phrasePair[i];
    totalSource += phrasePair[i].count;

    // check for matches
    //cerr << "phrasePairGroup.size() = " << phrasePairGroup.size() << endl;
    // try to insert a fresh singleton collection keyed by this alignment
    PhraseAlignmentCollection phraseAlignColl;
    phraseAlignColl.push_back(&currPhrasePair);
    pair<PhrasePairGroup::iterator, bool> retInsert;
    retInsert = phrasePairGroup.insert(phraseAlignColl);
    if (!retInsert.second) {
      // already exist. Add to that collection instead.
      // NOTE(review): this mutates an element held inside an associative
      // container via const_cast. That is only safe if the container's
      // comparator ignores the appended elements (presumably it keys on
      // the first/representative alignment only) — TODO confirm against
      // PhrasePairGroup's comparison function.
      PhraseAlignmentCollection &existingColl = const_cast<PhraseAlignmentCollection&>(*retInsert.first);
      existingColl.push_back(&currPhrasePair);
    }
  }

  // output the distinct phrase pairs, one at a time
  const PhrasePairGroup::SortedColl &sortedColl = phrasePairGroup.GetSortedColl();
  PhrasePairGroup::SortedColl::const_iterator iter;
  for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) {
    const PhraseAlignmentCollection &group = **iter;
    // totalSource = denominator over all groups; GetSize() = distinct count
    outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );
  }
}
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile ) { if (phrasePair.size() == 0) return; PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); // compute count float count = 0; for(size_t i=0; i<phrasePair.size(); i++) { count += phrasePair[i]->count; } // collect count of count statistics if (goodTuringFlag || kneserNeyFlag) { totalDistinct++; int countInt = count + 0.99999; if(countInt <= COC_MAX) countOfCounts[ countInt ]++; } // compute PCFG score float pcfgScore; if (pcfgFlag && !inverseFlag) { float pcfgSum = 0; for(size_t i=0; i<phrasePair.size(); ++i) { pcfgSum += phrasePair[i]->pcfgSum; } pcfgScore = pcfgSum / count; } // output phrases const PHRASE &phraseS = phrasePair[0]->GetSource(); const PHRASE &phraseT = phrasePair[0]->GetTarget(); // do not output if hierarchical and count below threshold if (hierarchicalFlag && count < minCountHierarchical) { for(int j=0; j<phraseS.size()-1; j++) { if (isNonTerminal(vcbS.getWord( phraseS[j] ))) return; } } // source phrase (unless inverse) if (! inverseFlag) { printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); phraseTableFile << " ||| "; } // target phrase printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); phraseTableFile << " ||| "; // source phrase (if inverse) if (inverseFlag) { printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); phraseTableFile << " ||| "; } // lexical translation probability if (lexFlag) { double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment); phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore ); } // unaligned word penalty if (unalignedFlag) { double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment); phraseTableFile << " " << ( logProbFlag ? 
negLogProb*log(penalty) : penalty ); } // unaligned function word penalty if (unalignedFWFlag) { double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment); phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty ); } if (pcfgFlag && !inverseFlag) { // target-side PCFG score phraseTableFile << " " << pcfgScore; } phraseTableFile << " ||| "; // alignment info for non-terminals if (! inverseFlag) { if (hierarchicalFlag) { // always output alignment if hiero style, but only for non-terms assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); for(int j = 0; j < phraseT.size() - 1; j++) { if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { if (bestAlignment->alignedToT[ j ].size() != 1) { cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl; phraseTableFile.flush(); assert(bestAlignment->alignedToT[ j ].size() == 1); } int sourcePos = *(bestAlignment->alignedToT[ j ].begin()); phraseTableFile << sourcePos << "-" << j << " "; } } } else if (wordAlignmentFlag) { // alignment info in pb model for(int j=0; j<bestAlignment->alignedToT.size(); j++) { const set< size_t > &aligned = bestAlignment->alignedToT[j]; for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) { phraseTableFile << *p << "-" << j << " "; } } } } // counts phraseTableFile << " ||| " << totalCount << " " << count; if (kneserNeyFlag) phraseTableFile << " " << distinctCount; // nt lengths if (outputNTLengths) { phraseTableFile << " ||| "; if (!inverseFlag) { map<size_t, map<size_t, float> > sourceProb, targetProb; // 1st sourcePos, 2nd = length, 3rd = prob calcNTLengthProb(phrasePair, sourceProb, targetProb); outputNTLengthProbs(phraseTableFile, sourceProb, "S"); outputNTLengthProbs(phraseTableFile, targetProb, "T"); } } phraseTableFile << endl; }
// Writes one phrase-table line for a group of identical phrase pairs
// (newer overload: adds singleton indicator and extended feature set).
// Fields (flag-dependent): source ||| target ||| scores ||| alignment |||
// counts [ ||| NT-length probs ].
// @param phrasePair    group of alignments for one distinct phrase pair
// @param totalCount    total count over all groups (score denominator)
// @param distinctCount number of distinct groups (for Kneser-Ney)
// @param phraseTableFile output stream; one line is appended
// @param isSingleton   true if this pair occurred only once (singleton feature)
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton ) {
  if (phrasePair.size() == 0) return;

  const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );

  // compute count of this group (sum over member alignments)
  float count = 0;
  for(size_t i=0; i<phrasePair.size(); i++) {
    count += phrasePair[i]->count;
  }

  // compute domain counts: domain label -> summed count of pairs from it
  map< string, float > domainCount;
  if (domainFlag) {
    for(size_t i=0; i<phrasePair.size(); i++) {
      string d = domain->getDomainOfSentence( phrasePair[i]->sentenceId );
      if (domainCount.find( d ) == domainCount.end())
        domainCount[ d ] = phrasePair[i]->count;
      else
        domainCount[ d ] += phrasePair[i]->count;
    }
  }

  // collect count of count statistics (for Good-Turing / Kneser-Ney smoothing)
  if (goodTuringFlag || kneserNeyFlag) {
    totalDistinct++;
    // round fractional counts up to the next integer bucket
    int countInt = count + 0.99999;
    if(countInt <= COC_MAX)
      countOfCounts[ countInt ]++;
  }

  // compute PCFG score (average pcfgSum weighted by count)
  float pcfgScore = 0;
  if (pcfgFlag && !inverseFlag) {
    float pcfgSum = 0;
    for(size_t i=0; i<phrasePair.size(); ++i) {
      pcfgSum += phrasePair[i]->pcfgSum;
    }
    pcfgScore = pcfgSum / count;
  }

  // output phrases
  const PHRASE &phraseS = phrasePair[0]->GetSource();
  const PHRASE &phraseT = phrasePair[0]->GetTarget();

  // do not output if hierarchical and count below threshold
  // (only rules containing a non-terminal are pruned)
  if (hierarchicalFlag && count < minCountHierarchical) {
    for(size_t j=0; j<phraseS.size()-1; j++) {
      if (isNonTerminal(vcbS.getWord( phraseS[j] )))
        return;
    }
  }

  // source phrase (unless inverse)
  if (! inverseFlag) {
    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // target phrase
  printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
  phraseTableFile << " ||| ";

  // source phrase (if inverse)
  if (inverseFlag) {
    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
    phraseTableFile << " ||| ";
  }

  // lexical translation probability
  if (lexFlag) {
    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
    phraseTableFile << maybeLogProb( lexScore );
  }

  // unaligned word penalty
  if (unalignedFlag) {
    double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << maybeLogProb( penalty );
  }

  // unaligned function word penalty
  if (unalignedFWFlag) {
    double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
    phraseTableFile << " " << maybeLogProb( penalty );
  }

  // binary singleton indicator feature
  if (singletonFeature) {
    phraseTableFile << " " << (isSingleton ? 1 : 0);
  }

  // crossed non-terminal feature (reordering of non-terminals)
  if (crossedNonTerm && !inverseFlag) {
    phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
  }

  // target-side PCFG score
  if (pcfgFlag && !inverseFlag) {
    phraseTableFile << " " << maybeLogProb( pcfgScore );
  }

  // domain count features
  if (domainFlag) {
    if (domainSparseFlag) {
      // sparse, subset: single sparse feature naming the set of domains seen
      if (domainSubsetFlag) {
        typedef vector< string >::const_iterator I;
        phraseTableFile << " doms";
        for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
          if (domainCount.find( *i ) != domainCount.end() ) {
            phraseTableFile << "_" << *i;
          }
        }
        phraseTableFile << " 1";
      }
      // sparse, indicator or ratio: one sparse feature per observed domain
      else {
        typedef map< string, float >::const_iterator I;
        for (I i=domainCount.begin(); i != domainCount.end(); i++) {
          if (domainRatioFlag) {
            phraseTableFile << " domr_" << i->first << " " << (i->second / count);
          } else {
            phraseTableFile << " dom_" << i->first << " 1";
          }
        }
      }
    }
    // core, subset: one dense feature per non-empty domain subset (2^n - 1)
    else if (domainSubsetFlag) {
      if (domain->list.size() > 6) {
        cerr << "ERROR: too many domains for core domain subset features\n";
        exit(1);
      }
      // bitmap encodes which domains this pair was observed in
      size_t bitmap = 0;
      for(size_t bit = 0; bit < domain->list.size(); bit++) {
        if (domainCount.find( domain->list[ bit ] ) != domainCount.end()) {
          bitmap += 1 << bit;
        }
      }
      // fire e (2.718) for the matching subset, 1 for all others
      for(size_t i = 1; i < (1 << domain->list.size()); i++) {
        phraseTableFile << " " << maybeLogProb( (bitmap == i) ? 2.718 : 1 );
      }
    }
    // core, indicator or ratio: one dense feature per known domain
    else {
      typedef vector< string >::const_iterator I;
      for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
        if (domainCount.find( *i ) == domainCount.end() ) {
          phraseTableFile << " " << maybeLogProb( 1 );
        } else if (domainRatioFlag) {
          phraseTableFile << " " << maybeLogProb( exp( domainCount[ *i ] / count ) );
        } else {
          phraseTableFile << " " << maybeLogProb( 2.718 );
        }
      }
    }
  }
  phraseTableFile << " ||| ";

  // alignment info for non-terminals
  if (! inverseFlag) {
    if (hierarchicalFlag) {
      // always output alignment if hiero style, but only for non-terms
      // (eh: output all alignments, needed for some feature functions)
      assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
      std::vector<std::string> alignment;
      for(size_t j = 0; j < phraseT.size() - 1; j++) {
        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
          // each non-terminal must align to exactly one source position
          if (bestAlignment.alignedToT[ j ].size() != 1) {
            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
            phraseTableFile.flush();
            assert(bestAlignment.alignedToT[ j ].size() == 1);
          }
          int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
          //phraseTableFile << sourcePos << "-" << j << " ";
          std::stringstream point;
          point << sourcePos << "-" << j;
          alignment.push_back(point.str());
        } else {
          // terminal: may align to several source positions
          set<size_t>::iterator setIter;
          for(setIter = (bestAlignment.alignedToT[j]).begin(); setIter != (bestAlignment.alignedToT[j]).end(); setIter++) {
            int sourcePos = *setIter;
            //phraseTableFile << sourcePos << "-" << j << " ";
            std::stringstream point;
            point << sourcePos << "-" << j;
            alignment.push_back(point.str());
          }
        }
      }
      // now print all alignments, sorted by source index
      // NOTE(review): sort is lexicographic on the "s-t" strings, so e.g.
      // "10-0" sorts before "2-0" — presumably acceptable downstream; confirm.
      sort(alignment.begin(), alignment.end());
      for (size_t i = 0; i < alignment.size(); ++i) {
        phraseTableFile << alignment[i] << " ";
      }
    } else if (wordAlignmentFlag) {
      // alignment info in pb model
      for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
        const set< size_t > &aligned = bestAlignment.alignedToT[j];
        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
          phraseTableFile << *p << "-" << j << " ";
        }
      }
    }
  }

  // counts
  phraseTableFile << " ||| " << totalCount << " " << count;
  if (kneserNeyFlag)
    phraseTableFile << " " << distinctCount;

  // nt lengths
  if (outputNTLengths) {
    phraseTableFile << " ||| ";
    if (!inverseFlag) {
      map<size_t, map<size_t, float> > sourceProb, targetProb;
      // 1st sourcePos, 2nd = length, 3rd = prob
      calcNTLengthProb(phrasePair, sourceProb, targetProb);
      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
      outputNTLengthProbs(phraseTableFile, targetProb, "T");
    }
  }

  phraseTableFile << endl;
}