int main(int argc, char* argv[]) { cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n" << "modifying PhraseScore v1.4 written by Philipp Koehn\n" << "It computes statistics for extracted phrase pairs\n" << "if (direct):\n" << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n" << "if (inverse)\n" << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n"; if (argc != 4 && argc != 5) { cerr << "syntax: statistics extract lex phrase-table [inverse]\n"; exit(1); } char* &fileNameExtract = argv[1]; char* &fileNameLex = argv[2]; char* &fileNamePhraseTable = argv[3]; inverseFlag = false; if (argc > 4) { inverseFlag = true; cerr << "using inverse mode\n"; } // lexical translation table lexTable.load( fileNameLex ); // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table phraseTableFile.open(fileNamePhraseTable); if (phraseTableFile.fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } // loop through all extracted phrase translations int lastForeign = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; string line; while(getline(extractFileP, line)) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; PhraseAlignment phrasePair; bool isPhrasePair = phrasePair.create( line.c_str(), i ); if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { processPhrasePairs( phrasePairsWithSameF ); for(size_t j=0; j<phrasePairsWithSameF.size(); j++) phrasePairsWithSameF[j].clear(); phrasePairsWithSameF.clear(); phraseTableE.clear(); phraseTableF.clear(); phrasePair.clear(); // process line again, since phrase tables flushed phrasePair.create( line.c_str(), i ); phrasePairBase = 0; } lastForeign = phrasePair.foreign; if (isPhrasePair) phrasePairsWithSameF.push_back( phrasePair ); else phrasePairBase++; } processPhrasePairs( phrasePairsWithSameF ); phraseTableFile.close(); }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; for(int i=4;i<argc;i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts( fileNameExtract ); // sorted phrase extraction file ifstream extractFile; extractFile.open(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table phraseTableFile.open(fileNamePhraseTable); if (phraseTableFile.fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (lastSource > 0 && strcmp(line,lastLine) == 0) { lastPhrasePair->addToCount( line ); continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i ); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF ); for(int j=0;j<phrasePairsWithSameF.size();j++) phrasePairsWithSameF[j].clear(); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create( line, i ); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.GetSource(); phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs( phrasePairsWithSameF ); phraseTableFile.close(); }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--SourceSyntax] [--TargetSyntax] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; char* fileNameCountOfCounts; char* fileNameFunctionWords; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--SourceSyntax") == 0) { sourceSyntaxFlag = true; cerr << "using source syntax\n"; } else if (strcmp(argv[i],"--TargetSyntax") == 0) { targetSyntaxFlag = true; cerr << "using target syntax\n"; } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Good Turing discounting!\n"; exit(1); } fileNameCountOfCounts = argv[++i]; cerr << "adjusting phrase translation probabilities with Good Turing discounting\n"; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameCountOfCounts = argv[++i]; cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n"; } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) { unalignedFlag = true; cerr << "using unaligned word penalty\n"; } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) { unalignedFWFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameFunctionWords = argv[++i]; cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { minCountHierarchical = atof(argv[++i]); cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n"; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { outputNTLengths = true; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // is model string-to-tree? perhaps confusingly, this always refers to the // forward direction. same goes for sourceSyntaxFlag and targetSyntaxFlag. stringToTreeFlag = (!sourceSyntaxFlag && targetSyntaxFlag); // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // function word list if (unalignedFWFlag) loadFunctionWords( fileNameFunctionWords ); // compute count of counts for Good Turing discounting if (goodTuringFlag || kneserNeyFlag) { for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0; } // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table ostream *phraseTableFile; if (strcmp(fileNamePhraseTable, "-") == 0) { phraseTableFile = &cout; } else { ofstream *outputFile = new ofstream(); outputFile->open(fileNamePhraseTable); if (outputFile->fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations float lastCount = 0.0f; float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i ); lastCount = phrasePair.count; lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } // if new source phrase, process last batch if (lastPhrasePair != NULL && lastPhrasePair->GetSource() != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); phrasePairsWithSameF.clear(); lastPhrasePair = NULL; } // add phrase pairs to list, it's now the last one phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF.back(); } processPhrasePairs( phrasePairsWithSameF, *phraseTableFile ); phraseTableFile->flush(); if (phraseTableFile != &cout) { (dynamic_cast<ofstream*>(phraseTableFile))->close(); delete phraseTableFile; } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { writeCountOfCounts( fileNameCountOfCounts ); } }
int main(int argc, char* argv[]) { cerr << "Score v2.5 written by Philipp Koehn" << endl << "Modified by Ventsislav Zhechev, Autodesk Development Sàrl" << endl << "scoring methods for extracted rules" << endl ; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; char* fileNameWordAlignment; for(int i=4; i<argc; ++i) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; fileNameWordAlignment = argv[++i]; cerr << "outputing word alignment in file " << fileNameWordAlignment << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load(fileNameLex); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts(fileNameExtract); // sorted phrase extraction file Bz2LineReader extractFile(fileNameExtract); // output file: phrase translation table Bz2LineWriter phraseTableFile(fileNamePhraseTable); // output word alignment file if (!inverseFlag && wordAlignmentFlag) { wordAlignmentFile.open(fileNameWordAlignment); if (wordAlignmentFile.fail()) { cerr << "ERROR: could not open word alignment file " << fileNameWordAlignment << endl; exit(1); } } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; string lastLine = ""; PhraseAlignment *lastPhrasePair = NULL; for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) { if (line.empty()) break; if ((++i)%10000000 == 0) cerr << "[p. score:" << i << "]" << flush; else if (i % 100000 == 0) cerr << "." << flush; // identical to last line? just add count if (lastSource >= 0 && line == lastLine) { lastPhrasePair->addToCount(line); continue; } lastLine = line; // create new phrase pair PhraseAlignment phrasePair; vector<string> lineVector = tokenize(line.c_str()); phrasePair.create(lineVector, i); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals(phrasePair)) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.source) { processPhrasePairs(phrasePairsWithSameF, phraseTableFile); for (size_t j=0; j<phrasePairsWithSameF.size(); phrasePairsWithSameF[j++].clear()); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create(lineVector, i); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.source; phrasePairsWithSameF.push_back(phrasePair); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs(phrasePairsWithSameF, phraseTableFile); if (!inverseFlag && wordAlignmentFlag) wordAlignmentFile.close(); }