int PhraseAlignment::Compare(const PhraseAlignment &other) const { if (this == &other) // comparing with itself return 0; if (GetTarget() != other.GetTarget()) return ( GetTarget() < other.GetTarget() ) ? -1 : +1; if (GetSource() != other.GetSource()) return ( GetSource() < other.GetSource() ) ? -1 : +1; if (!hierarchicalFlag) return 0; // loop over all words (note: 0 = left hand side of rule) for(size_t i=0; i<phraseT.size()-1; i++) { if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) { size_t thisAlign = *(alignedToT[i].begin()); size_t otherAlign = *(other.alignedToT[i].begin()); if (alignedToT[i].size() != 1 || other.alignedToT[i].size() != 1 || thisAlign != otherAlign) { int ret = (thisAlign < otherAlign) ? -1 : +1; return ret; } } } return 0; }
// check if two word alignments between a phrase pair are the same bool PhraseAlignment::equals( const PhraseAlignment& other ) { if (this == &other) return true; if (other.GetTarget() != GetTarget()) return false; if (other.GetSource() != GetSource()) return false; if (other.alignedToT != alignedToT) return false; if (other.alignedToS != alignedToS) return false; return true; }
int PhraseAlignment::Compare(const PhraseAlignment &other) const { if (this == &other) // comparing with itself return 0; if (GetTarget() != other.GetTarget()) //先比的是目标端 return ( GetTarget() < other.GetTarget() ) ? -1 : +1; if (GetSource() != other.GetSource()) return ( GetSource() < other.GetSource() ) ? -1 : +1; return 0; }
// check if two word alignments between a phrase pair are the same bool PhraseAlignment::equals( const PhraseAlignment& other ) { if (this == &other) return true; if (other.GetRuleId() != this->GetRuleId() ) return false; if (other.alignedToT != alignedToT) return false; if (other.alignedToS != alignedToS) return false; return true; }
// check if two word alignments between a phrase pairs "match" // i.e. they do not differ in the alignment of non-termimals bool PhraseAlignment::match( const PhraseAlignment& other ) { if (this == &other) return true; if (other.GetTarget() != GetTarget()) return false; if (other.GetSource() != GetSource()) return false; if (!hierarchicalFlag) return true; assert(phraseT.size() == alignedToT.size() + 1); assert(alignedToT.size() == other.alignedToT.size()); // loop over all words (note: 0 = left hand side of rule) for(size_t i=0; i<phraseT.size()-1; i++) { if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) { if (alignedToT[i].size() != 1 || other.alignedToT[i].size() != 1 || *(alignedToT[i].begin()) != *(other.alignedToT[i].begin())) return false; } } return true; }
int main(int argc, char* argv[]) { cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n" << "modifying PhraseScore v1.4 written by Philipp Koehn\n" << "It computes statistics for extracted phrase pairs\n" << "if (direct):\n" << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n" << "if (inverse)\n" << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n"; if (argc != 4 && argc != 5) { cerr << "syntax: statistics extract lex phrase-table [inverse]\n"; exit(1); } char* &fileNameExtract = argv[1]; char* &fileNameLex = argv[2]; char* &fileNamePhraseTable = argv[3]; inverseFlag = false; if (argc > 4) { inverseFlag = true; cerr << "using inverse mode\n"; } // lexical translation table lexTable.load( fileNameLex ); // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table phraseTableFile.open(fileNamePhraseTable); if (phraseTableFile.fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } // loop through all extracted phrase translations int lastForeign = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; string line; while(getline(extractFileP, line)) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; PhraseAlignment phrasePair; bool isPhrasePair = phrasePair.create( line.c_str(), i ); if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { processPhrasePairs( phrasePairsWithSameF ); for(size_t j=0; j<phrasePairsWithSameF.size(); j++) phrasePairsWithSameF[j].clear(); phrasePairsWithSameF.clear(); phraseTableE.clear(); phraseTableF.clear(); phrasePair.clear(); // process line again, since phrase tables flushed phrasePair.create( line.c_str(), i ); phrasePairBase = 0; } lastForeign = phrasePair.foreign; if (isPhrasePair) phrasePairsWithSameF.push_back( phrasePair ); else phrasePairBase++; } processPhrasePairs( phrasePairsWithSameF ); phraseTableFile.close(); }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--SourceSyntax] [--TargetSyntax] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; char* fileNameCountOfCounts; char* fileNameFunctionWords; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--SourceSyntax") == 0) { sourceSyntaxFlag = true; cerr << "using source syntax\n"; } else if (strcmp(argv[i],"--TargetSyntax") == 0) { targetSyntaxFlag = true; cerr << "using target syntax\n"; } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Good Turing discounting!\n"; exit(1); } fileNameCountOfCounts = argv[++i]; cerr << "adjusting phrase translation probabilities with Good Turing discounting\n"; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameCountOfCounts = argv[++i]; cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n"; } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) { unalignedFlag = true; cerr << "using unaligned word penalty\n"; } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) { unalignedFWFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameFunctionWords = argv[++i]; cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { minCountHierarchical = atof(argv[++i]); cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n"; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { outputNTLengths = true; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // is model string-to-tree? perhaps confusingly, this always refers to the // forward direction. same goes for sourceSyntaxFlag and targetSyntaxFlag. stringToTreeFlag = (!sourceSyntaxFlag && targetSyntaxFlag); // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // function word list if (unalignedFWFlag) loadFunctionWords( fileNameFunctionWords ); // compute count of counts for Good Turing discounting if (goodTuringFlag || kneserNeyFlag) { for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0; } // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table ostream *phraseTableFile; if (strcmp(fileNamePhraseTable, "-") == 0) { phraseTableFile = &cout; } else { ofstream *outputFile = new ofstream(); outputFile->open(fileNamePhraseTable); if (outputFile->fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations float lastCount = 0.0f; float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i ); lastCount = phrasePair.count; lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } // if new source phrase, process last batch if (lastPhrasePair != NULL && lastPhrasePair->GetSource() != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); phrasePairsWithSameF.clear(); lastPhrasePair = NULL; } // add phrase pairs to list, it's now the last one phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF.back(); } processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); phraseTableFile->flush(); if (phraseTableFile != &cout) { (dynamic_cast<ofstream*>(phraseTableFile))->close(); delete phraseTableFile; } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { writeCountOfCounts( fileNameCountOfCounts ); } }
// check if two word alignments between a phrase pairs "match" // i.e. they do not differ in the alignment of non-termimals bool PhraseAlignment::match( const PhraseAlignment& other ) { if (this == &other) return true; if(other.GetRuleId() != this->GetRuleId()) return false; return true; }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; for(int i=4;i<argc;i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts( fileNameExtract ); // sorted phrase extraction file ifstream extractFile; extractFile.open(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table phraseTableFile.open(fileNamePhraseTable); if (phraseTableFile.fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (lastSource > 0 && strcmp(line,lastLine) == 0) { lastPhrasePair->addToCount( line ); continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i ); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF ); for(int j=0;j<phrasePairsWithSameF.size();j++) phrasePairsWithSameF[j].clear(); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create( line, i ); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.GetSource(); phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs( phrasePairsWithSameF ); phraseTableFile.close(); }
void computeCountOfCounts( char* fileNameExtract ) { cerr << "computing counts of counts"; for(int i=1;i<=GT_MAX;i++) countOfCounts[i] = 0; ifstream extractFile; extractFile.open( fileNameExtract ); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // loop through all extracted phrase translations int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->addToCount( line ); continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment *phrasePair = new PhraseAlignment(); phrasePair->create( line, i ); if (i == 1) { lastPhrasePair = phrasePair; continue; } // only differs in count? just add count if (lastPhrasePair->match( *phrasePair )) { lastPhrasePair->count += phrasePair->count; phrasePair->clear(); delete(phrasePair); continue; } // periodically house cleaning if (phrasePair->GetSource() != lastPhrasePair->GetSource()) { phraseTableT.clear(); // these would get too big phraseTableS.clear(); // these would get too big // process line again, since phrase tables flushed phrasePair->clear(); phrasePair->create( line, i ); } int count = lastPhrasePair->count + 0.99999; if(count <= GT_MAX) countOfCounts[ count ]++; lastPhrasePair->clear(); delete( lastPhrasePair ); lastPhrasePair = phrasePair; } delete lastPhrasePair; discountFactor[0] = 0.01; // floor cerr << "\n"; for(int i=1;i<GT_MAX;i++) { discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)); cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i]; // some smoothing... if (discountFactor[i]>1) discountFactor[i] = 1; if (discountFactor[i]<discountFactor[i-1]) discountFactor[i] = discountFactor[i-1]; cerr << " -> " << discountFactor[i]*i << endl; } }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n"; exit(1); } string fileNameExtract = argv[1]; string fileNameLex = argv[2]; string fileNamePhraseTable = argv[3]; string fileNameCountOfCounts; char* fileNameFunctionWords = NULL; char* fileNameDomain = NULL; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) { unpairedExtractFormatFlag = true; cerr << "processing unpaired extract format\n"; } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { conditionOnTargetLhsFlag = true; cerr << "processing unpaired extract format\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Good Turing discounting\n"; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n"; } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) { unalignedFlag = true; cerr << "using unaligned word penalty\n"; } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) { unalignedFWFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameFunctionWords = argv[++i]; cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl; } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 || strcmp(argv[i],"--SparseDomainRatio") == 0 || strcmp(argv[i],"--SparseDomainSubset") == 0 || strcmp(argv[i],"--DomainIndicator") == 0 || strcmp(argv[i],"--DomainRatio") == 0 || strcmp(argv[i],"--DomainSubset") == 0) { includeSentenceIdFlag = true; domainFlag = true; domainSparseFlag = strstr( argv[i], "Sparse" ); domainRatioFlag = strstr( argv[i], "Ratio" ); domainSubsetFlag = strstr( argv[i], "Subset" ); if (i+1==argc) { cerr << "ERROR: specify domain info file with " << argv[i] << endl; exit(1); } fileNameDomain = argv[++i]; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { minCountHierarchical = atof(argv[++i]); cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n"; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { outputNTLengths = true; } else if (strcmp(argv[i],"--Singleton") == 0) { singletonFeature = true; cerr << "binary singleton feature\n"; } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) { crossedNonTerm = true; cerr << "crossed non-term reordering feature\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // function word list if (unalignedFWFlag) loadFunctionWords( fileNameFunctionWords ); // load domain information if (domainFlag) { if (inverseFlag) { domainFlag = false; includeSentenceIdFlag = false; } else { domain = new Domain; domain->load( fileNameDomain ); } } // compute count of counts for Good Turing discounting if (goodTuringFlag || kneserNeyFlag) { for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0; } // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table ostream *phraseTableFile; if (fileNamePhraseTable == "-") { phraseTableFile = &cout; } else { Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); bool success = outputFile->Open(fileNamePhraseTable); if (!success) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations float lastCount = 0.0f; float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; bool isSingleton = true; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i, includeSentenceIdFlag ); lastCount = phrasePair.count; lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair ) && (!domainFlag || domain->getDomainOfSentence( lastPhrasePair->sentenceId ) == domain->getDomainOfSentence( phrasePair.sentenceId ) )) { lastPhrasePair->count += phrasePair.count; lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } // if new source phrase, process last batch if (lastPhrasePair != NULL && lastPhrasePair->GetSource() != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton ); phrasePairsWithSameF.clear(); isSingleton = false; lastPhrasePair = NULL; } else { isSingleton = true; } // add phrase pairs to list, it's now the last one phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF.back(); } processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton ); phraseTableFile->flush(); if (phraseTableFile != &cout) { delete phraseTableFile; } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { writeCountOfCounts( fileNameCountOfCounts ); } }
int main(int argc, char* argv[]) { cerr << "Score v2.5 written by Philipp Koehn" << endl << "Modified by Ventsislav Zhechev, Autodesk Development Sàrl" << endl << "scoring methods for extracted rules" << endl ; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; char* fileNameWordAlignment; for(int i=4; i<argc; ++i) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; fileNameWordAlignment = argv[++i]; cerr << "outputing word alignment in file " << fileNameWordAlignment << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load(fileNameLex); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts(fileNameExtract); // sorted phrase extraction file Bz2LineReader extractFile(fileNameExtract); // output file: phrase translation table Bz2LineWriter phraseTableFile(fileNamePhraseTable); // output word alignment file if (!inverseFlag && wordAlignmentFlag) { wordAlignmentFile.open(fileNameWordAlignment); if (wordAlignmentFile.fail()) { cerr << "ERROR: could not open word alignment file " << fileNameWordAlignment << endl; exit(1); } } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; string lastLine = ""; PhraseAlignment *lastPhrasePair = NULL; for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) { if (line.empty()) break; if ((++i)%10000000 == 0) cerr << "[p. score:" << i << "]" << flush; else if (i % 100000 == 0) cerr << "." << flush; // identical to last line? just add count if (lastSource >= 0 && line == lastLine) { lastPhrasePair->addToCount(line); continue; } lastLine = line; // create new phrase pair PhraseAlignment phrasePair; vector<string> lineVector = tokenize(line.c_str()); phrasePair.create(lineVector, i); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals(phrasePair)) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.source) { processPhrasePairs(phrasePairsWithSameF, phraseTableFile); for (size_t j=0; j<phrasePairsWithSameF.size(); phrasePairsWithSameF[j++].clear()); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create(lineVector, i); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.source; phrasePairsWithSameF.push_back(phrasePair); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs(phrasePairsWithSameF, phraseTableFile); if (!inverseFlag && wordAlignmentFlag) wordAlignmentFile.close(); }
void computeCountOfCounts(const string& fileNameExtract) { if (fileNameExtract == "-") { cerr << "The ‘GoodTuring Discounting’ option may not be used with piped input!" << endl; exit(9); } cerr << "computing counts of counts"; for (size_t i=1; i<=GT_MAX; countOfCounts[i++] = 0); Bz2LineReader extractFile(fileNameExtract); // loop through all extracted phrase translations int i=0; string lastLine; PhraseAlignment *lastPhrasePair = NULL; for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) { if (line.empty()) break; if ((++i)%10000000 == 0) cerr << "[" << i << "]" << endl; else if (i % 100000 == 0) cerr << "," << flush; // identical to last line? just add count if (line == lastLine) { lastPhrasePair->addToCount(line); continue; } lastLine = line; // create new phrase pair PhraseAlignment *phrasePair = new PhraseAlignment(); vector<string> lineVector = tokenize(line.c_str()); phrasePair->create(lineVector, i); if (i == 1) { lastPhrasePair = phrasePair; continue; } // only differs in count? just add count if (lastPhrasePair->match( *phrasePair )) { lastPhrasePair->count += phrasePair->count; phrasePair->clear(); delete(phrasePair); continue; } // periodically house cleaning if (phrasePair->source != lastPhrasePair->source) { phraseTableT.clear(); // these would get too big phraseTableS.clear(); // these would get too big // process line again, since phrase tables flushed phrasePair->clear(); phrasePair->create(lineVector, i); } int count = lastPhrasePair->count + 0.99999; if(count <= GT_MAX) ++countOfCounts[ count ]; lastPhrasePair->clear(); delete( lastPhrasePair ); lastPhrasePair = phrasePair; } discountFactor[0] = 0.01; // floor cerr << "\n"; for(int i=1;i<GT_MAX; ++i) { discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)); cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i]; // some smoothing... if (discountFactor[i]>1) discountFactor[i] = 1; if (discountFactor[i]<discountFactor[i-1]) discountFactor[i] = discountFactor[i-1]; cerr << " -> " << discountFactor[i]*i << endl; } }