int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--SourceSyntax] [--TargetSyntax] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; char* fileNameCountOfCounts; char* fileNameFunctionWords; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--SourceSyntax") == 0) { sourceSyntaxFlag = true; cerr << "using source syntax\n"; } else if (strcmp(argv[i],"--TargetSyntax") == 0) { targetSyntaxFlag = true; cerr << "using target syntax\n"; } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Good Turing discounting!\n"; exit(1); } fileNameCountOfCounts = argv[++i]; cerr << "adjusting phrase translation probabilities with Good Turing discounting\n"; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameCountOfCounts = argv[++i]; cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n"; } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) { unalignedFlag = true; cerr << "using unaligned word penalty\n"; } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) { unalignedFWFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameFunctionWords = argv[++i]; cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { minCountHierarchical = atof(argv[++i]); cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n"; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { outputNTLengths = true; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // is model string-to-tree? perhaps confusingly, this always refers to the // forward direction. same goes for sourceSyntaxFlag and targetSyntaxFlag. stringToTreeFlag = (!sourceSyntaxFlag && targetSyntaxFlag); // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // function word list if (unalignedFWFlag) loadFunctionWords( fileNameFunctionWords ); // compute count of counts for Good Turing discounting if (goodTuringFlag || kneserNeyFlag) { for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0; } // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table ostream *phraseTableFile; if (strcmp(fileNamePhraseTable, "-") == 0) { phraseTableFile = &cout; } else { ofstream *outputFile = new ofstream(); outputFile->open(fileNamePhraseTable); if (outputFile->fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations float lastCount = 0.0f; float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i ); lastCount = phrasePair.count; lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } // if new source phrase, process last batch if (lastPhrasePair != NULL && lastPhrasePair->GetSource() != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); phrasePairsWithSameF.clear(); lastPhrasePair = NULL; } // add phrase pairs to list, it's now the last one phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF.back(); } processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); phraseTableFile->flush(); if (phraseTableFile != &cout) { (dynamic_cast<ofstream*>(phraseTableFile))->close(); delete phraseTableFile; } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { writeCountOfCounts( fileNameCountOfCounts ); } }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n"; exit(1); } string fileNameExtract = argv[1]; string fileNameLex = argv[2]; string fileNamePhraseTable = argv[3]; string fileNameCountOfCounts; char* fileNameFunctionWords = NULL; char* fileNameDomain = NULL; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) { unpairedExtractFormatFlag = true; cerr << "processing unpaired extract format\n"; } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { conditionOnTargetLhsFlag = true; cerr << "processing unpaired extract format\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Good Turing discounting\n"; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n"; } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) { unalignedFlag = true; cerr << "using unaligned word penalty\n"; } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) { unalignedFWFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameFunctionWords = argv[++i]; cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl; } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 || strcmp(argv[i],"--SparseDomainRatio") == 0 || strcmp(argv[i],"--SparseDomainSubset") == 0 || strcmp(argv[i],"--DomainIndicator") == 0 || strcmp(argv[i],"--DomainRatio") == 0 || strcmp(argv[i],"--DomainSubset") == 0) { includeSentenceIdFlag = true; domainFlag = true; domainSparseFlag = strstr( argv[i], "Sparse" ); domainRatioFlag = strstr( argv[i], "Ratio" ); domainSubsetFlag = strstr( argv[i], "Subset" ); if (i+1==argc) { cerr << "ERROR: specify domain info file with " << argv[i] << endl; exit(1); } fileNameDomain = argv[++i]; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { minCountHierarchical = atof(argv[++i]); cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n"; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { outputNTLengths = true; } else if (strcmp(argv[i],"--Singleton") == 0) { singletonFeature = true; cerr << "binary singleton feature\n"; } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) { crossedNonTerm = true; cerr << "crossed non-term reordering feature\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // function word list if (unalignedFWFlag) loadFunctionWords( fileNameFunctionWords ); // load domain information if (domainFlag) { if (inverseFlag) { domainFlag = false; includeSentenceIdFlag = false; } else { domain = new Domain; domain->load( fileNameDomain ); } } // compute count of counts for Good Turing discounting if (goodTuringFlag || kneserNeyFlag) { for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0; } // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table ostream *phraseTableFile; if (fileNamePhraseTable == "-") { phraseTableFile = &cout; } else { Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); bool success = outputFile->Open(fileNamePhraseTable); if (!success) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations float lastCount = 0.0f; float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; bool isSingleton = true; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i, includeSentenceIdFlag ); lastCount = phrasePair.count; lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair ) && (!domainFlag || domain->getDomainOfSentence( lastPhrasePair->sentenceId ) == domain->getDomainOfSentence( phrasePair.sentenceId ) )) { lastPhrasePair->count += phrasePair.count; lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } // if new source phrase, process last batch if (lastPhrasePair != NULL && lastPhrasePair->GetSource() != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton ); phrasePairsWithSameF.clear(); isSingleton = false; lastPhrasePair = NULL; } else { isSingleton = true; } // add phrase pairs to list, it's now the last one phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF.back(); } processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton ); phraseTableFile->flush(); if (phraseTableFile != &cout) { delete phraseTableFile; } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { writeCountOfCounts( fileNameCountOfCounts ); } }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; for(int i=4;i<argc;i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts( fileNameExtract ); // sorted phrase extraction file ifstream extractFile; extractFile.open(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table phraseTableFile.open(fileNamePhraseTable); if (phraseTableFile.fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (lastSource > 0 && strcmp(line,lastLine) == 0) { lastPhrasePair->addToCount( line ); continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i ); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF ); for(int j=0;j<phrasePairsWithSameF.size();j++) phrasePairsWithSameF[j].clear(); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create( line, i ); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.GetSource(); phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs( phrasePairsWithSameF ); phraseTableFile.close(); }
int main(int argc, char* argv[]) { cerr << "Score v2.5 written by Philipp Koehn" << endl << "Modified by Ventsislav Zhechev, Autodesk Development Sàrl" << endl << "scoring methods for extracted rules" << endl ; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; char* fileNameWordAlignment; for(int i=4; i<argc; ++i) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; fileNameWordAlignment = argv[++i]; cerr << "outputing word alignment in file " << fileNameWordAlignment << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load(fileNameLex); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts(fileNameExtract); // sorted phrase extraction file Bz2LineReader extractFile(fileNameExtract); // output file: phrase translation table Bz2LineWriter phraseTableFile(fileNamePhraseTable); // output word alignment file if (!inverseFlag && wordAlignmentFlag) { wordAlignmentFile.open(fileNameWordAlignment); if (wordAlignmentFile.fail()) { cerr << "ERROR: could not open word alignment file " << fileNameWordAlignment << endl; exit(1); } } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; string lastLine = ""; PhraseAlignment *lastPhrasePair = NULL; for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) { if (line.empty()) break; if ((++i)%10000000 == 0) cerr << "[p. score:" << i << "]" << flush; else if (i % 100000 == 0) cerr << "." << flush; // identical to last line? just add count if (lastSource >= 0 && line == lastLine) { lastPhrasePair->addToCount(line); continue; } lastLine = line; // create new phrase pair PhraseAlignment phrasePair; vector<string> lineVector = tokenize(line.c_str()); phrasePair.create(lineVector, i); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals(phrasePair)) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.source) { processPhrasePairs(phrasePairsWithSameF, phraseTableFile); for (size_t j=0; j<phrasePairsWithSameF.size(); phrasePairsWithSameF[j++].clear()); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create(lineVector, i); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.source; phrasePairsWithSameF.push_back(phrasePair); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs(phrasePairsWithSameF, phraseTableFile); if (!inverseFlag && wordAlignmentFlag) wordAlignmentFile.close(); }